from dotenv import load_dotenv
load_dotenv("dev.env")
import os
db_name = os.getenv("db_name")
db_username = os.getenv("db_username")
db_password = os.getenv("db_password")
db_host = os.getenv("db_host")
db_port = os.getenv("db_port")
website = os.getenv("website")
city_link = os.getenv("city_link")
chrome_path = os.getenv("chrome_path")
selenium_chrome_driver_path = os.getenv("selenium_chrome_driver_path")
First, we are loading environment variables, this is not required for non-professional projects but it is a good practice to hide your private variables that contains database credentials which might make your server vulnerable.
import psycopg2
import time
import math
import numpy as np
import os.path
import pandas as pd
import re
import pycld2
import nltk
import joblib
import warnings
import urllib.request
import io
import matplotlib.pyplot as plt
import cv2
import tensorflow as tf
import keras
import sys
import sweetviz as sv
import webbrowser
import gc
import matplotlib.image as mpimg
from datetime import date, timedelta, datetime
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from nltk.corpus import stopwords
from TurkishStemmer import TurkishStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDRegressor, ElasticNet, BayesianRidge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVC, LinearSVC, SVR
from sklearn.kernel_ridge import KernelRidge
from xgboost.sklearn import XGBRegressor
from lightgbm import LGBMRegressor
#from catboost import CatBoostRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn import mixture
from sklearn.metrics import accuracy_score, f1_score, precision_score, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from PIL import Image
from keras.utils import np_utils
from keras.layers.core import Dense, Activation, Dropout, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.optimizers import SGD, RMSprop
from keras.utils import np_utils
from sklearn import metrics
from sklearn.utils import shuffle
from wordcloud import WordCloud
#from IPython.display import display, Image
nltk.download('stopwords', quiet=True)
stemmer = TurkishStemmer()
conn_string = 'host={pghost} port={pgport} dbname={pgdatabase} user={pguser} password={pgpassword}'.format(pgdatabase=db_name,pguser=db_username,pgpassword=db_password,pghost=db_host,pgport=db_port)
conn=psycopg2.connect(conn_string)
cur=conn.cursor()
options = Options()
options.binary_location = chrome_path
warnings.filterwarnings('ignore')
def check_if_table_exists(schema,table):
cur.execute("select exists(select * from information_schema.tables where table_schema='{schema}' AND table_name='{table}')".format(schema=schema, table=table))
return cur.fetchone()[0]
def check_if_index_exists(index):
cur.execute("SELECT EXISTS(SELECT * FROM PG_CLASS WHERE relname = '{index}')".format(index=index))
return cur.fetchone()[0]
def check_if_file_exists(filename):
return os.path.isfile(filename)
def execute_mogrify(conn, df, schema, table):
tuples = [tuple(x) for x in df.to_numpy()]
cols = '"'+'","'.join(list(df.columns))+'"'
cursor = conn.cursor()
try:
for tup in tuples:
query = """INSERT INTO "{schema}"."{table}"({cols}) VALUES ({values}) ON CONFLICT DO NOTHING""".format(schema=schema,table=table, cols=cols, values=",".join(map(str,tup)))
cursor.execute(query)
conn.commit()
except (Exception, psycopg2.DatabaseError) as error:
print("Error: %s" % error)
conn.rollback()
cursor.close()
return 1
cursor.close()
def df_column_conversation(df, column_name, type):
if(type == 'timestamp'):
df[column_name] = df[column_name].apply(lambda x: f"'{x}'::timestamp")
if(type == 'text'):
df[column_name] = df[column_name].str.replace("'","").apply(lambda x: f"'{x}'")
if(type == 'date'):
df[column_name] = df[column_name].apply(lambda x: f"'{x}'::date")
if(type == 'numeric'):
df[column_name] = df[column_name].apply(str).str.replace(',','.')
if(type == 'integer'):
df[column_name] = df[column_name].apply(str).str.replace(',','.').apply(float).astype('Int64').apply(str)
def clean_text(text):
text = text.lower()
text = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\]", "", text)
return text
def webpage_scroll_down(driver, scroll_speed):
total_height = int(driver.execute_script("return document.body.scrollHeight"))
for i in range(1, total_height, scroll_speed):
driver.execute_script("window.scrollTo(0, {});".format(i))
def byte_image_to_numpy_array(byte_image, image_size):
return cv2.resize(np.array(Image.open(io.BytesIO(byte_image))), (image_size, image_size))
def analyze_data(outputHTML,upToDateCheck,schema,table):
if not((check_if_file_exists(outputHTML) and upToDateCheck)):
sql_command = """
SELECT *
FROM "{schema}"."{table}";
""".format(schema=schema, table=table)
df = pd.read_sql(sql_command,conn)
eda = sv.analyze(df)
eda.show_html(outputHTML, open_browser=False)
del df
gc.collect()
#eda_sales.show_notebook()
webbrowser.open(outputHTML,new=2)
print('You can see the exploratory data analysis visualizations in your browser.')
def analyze_dataframe(outputHTML,upToDateCheck,df):
if not((check_if_file_exists(outputHTML) and upToDateCheck)):
eda = sv.analyze(df)
eda.show_html(outputHTML, open_browser=False)
del df
gc.collect()
#eda_sales.show_notebook()
webbrowser.open(outputHTML,new=2)
print('You can see the exploratory data analysis visualizations in your browser.')
def feature_importance_plot(label, model_name, feature_columns, feature_importance):
plt.title('{label} {model_name} Feature Importance'.format(label=label.title(), model_name=model_name))
plt.barh(feature_columns, model.feature_importances_)
plt.savefig('images/{label}.png'.format(label=label))
plt.close()
def merge_images(image_paths):
images = [Image.open(x) for x in image_paths]
widths, heights = zip(*(i.size for i in images))
total_width = sum(widths)
max_height = max(heights)
merged_im = Image.new('RGB', (total_width, max_height))
x_offset = 0
for im in images:
merged_im.paste(im, (x_offset,0))
x_offset += im.size[0]
return merged_im
def turkish_to_english_characters(text):
tr_en_chars = {'ı':'i', 'ğ':'g', 'İ':'I', 'Ğ':'G', 'ç':'c', 'Ç':'C', 'ş':'s', 'Ş':'S', 'ö':'o', 'Ö':'O', 'ü':'u', 'Ü':'U'}
for tr, en in tr_en_chars.items():
text = text.replace(tr, en)
return text
We are defining our functions and importing libraries which will be used in the next steps.
if(check_if_table_exists('ODS','EXT_FB_RESTAURANT')):
print('Table ODS.EXT_FB_RESTAURANT already exists.')
else:
start_time = math.trunc(time.time())
cur.execute("""
CREATE TABLE "ODS"."EXT_FB_RESTAURANT"
(
"RESTAURANT_ID" text NOT NULL,
"RESTAURANT_NAME" text,
"RESTAURANT_LINK" text,
"DATE" date,
CONSTRAINT "RESTAURANT_ID" UNIQUE ("RESTAURANT_ID")
);
""")
cur.execute('COMMIT;')
end_time = math.trunc(time.time())
print("Table ODS.EXT_FB_RESTAURANT created in {execute_time} seconds.".format(execute_time=end_time-start_time))
if(check_if_table_exists('ODS','EXT_FB_MENU')):
print('Table ODS.EXT_FB_MENU already exists.')
else:
start_time = math.trunc(time.time())
cur.execute("""
CREATE TABLE "ODS"."EXT_FB_MENU"
(
"PRODUCT_ID" text NOT NULL,
"RESTAURANT_ID" text,
"CATEGORY_NAME" text,
"PRODUCT_NAME" text,
"PRODUCT_DESCRIPTION" text,
"PRODUCT_LISTED_PRICE" text,
"PRODUCT_PRICE" text,
"DISCOUNT" boolean,
"DESIGN_TYPE" text,
"DATE" date,
CONSTRAINT "PRODUCT_ID" UNIQUE ("PRODUCT_ID")
);
""")
cur.execute('COMMIT;')
end_time = math.trunc(time.time())
print("Table ODS.EXT_FB_MENU created in {execute_time} seconds.".format(execute_time=end_time-start_time))
if(check_if_table_exists('ODS','EXT_FB_COMMENT')):
print('Table ODS.EXT_FB_COMMENT already exists.')
else:
start_time = math.trunc(time.time())
cur.execute("""
CREATE TABLE "ODS"."EXT_FB_COMMENT"
(
"RESTAURANT_ID" text,
"USERNAME" text,
"COMMENT_TEXT" text,
"COMMENT_DATE" text,
"SPEED" text,
"SERVING" text,
"FLAVOUR" text,
"DATE" date,
CONSTRAINT "UNIQUE_COMMENTS" UNIQUE ("RESTAURANT_ID", "USERNAME", "COMMENT_TEXT")
);
""")
cur.execute('COMMIT;')
end_time = math.trunc(time.time())
print("Table ODS.EXT_FB_COMMENT created in {execute_time} seconds.".format(execute_time=end_time-start_time))
if(check_if_table_exists('ODS','EXT_FB_PRODUCT_IMAGE')):
print('Table ODS.EXT_FB_PRODUCT_IMAGE already exists.')
else:
start_time = math.trunc(time.time())
cur.execute("""
CREATE TABLE "ODS"."EXT_FB_PRODUCT_IMAGE"
(
"PRODUCT_ID" text,
"RESTAURANT_ID" text,
"IMAGE_LINK" text,
"DATE" date,
CONSTRAINT "UNIQUE_IMAGES" UNIQUE ("PRODUCT_ID", "RESTAURANT_ID")
);
""")
cur.execute('COMMIT;')
end_time = math.trunc(time.time())
print("Table ODS.EXT_FB_PRODUCT_IMAGE created in {execute_time} seconds.".format(execute_time=end_time-start_time))
if(check_if_table_exists('ODS','EXT_FB_PRODUCT_IMAGE_SOURCE')):
print('Table ODS.EXT_FB_PRODUCT_IMAGE_SOURCE already exists.')
else:
start_time = math.trunc(time.time())
cur.execute("""
CREATE TABLE "ODS"."EXT_FB_PRODUCT_IMAGE_SOURCE"
(
"IMAGE_LINK" text NOT NULL,
"SOURCE" bytea,
CONSTRAINT "UNIQUE_SOURCE" PRIMARY KEY ("IMAGE_LINK")
);
""")
cur.execute('COMMIT;')
end_time = math.trunc(time.time())
print("Table ODS.EXT_FB_PRODUCT_IMAGE_SOURCE created in {execute_time} seconds.".format(execute_time=end_time-start_time))
if(check_if_table_exists('ODS','EXT_FB_RESTAURANT_SERVICE')):
print('Table ODS.EXT_FB_RESTAURANT_SERVICE already exists.')
else:
start_time = math.trunc(time.time())
cur.execute(
"""
CREATE TABLE "ODS"."EXT_FB_RESTAURANT_SERVICE"
(
"RESTAURANT_ID" text NOT NULL,
"MINIMUM_AMOUNT" text,
"SERVICE_TIME" text,
"DATE" date,
PRIMARY KEY ("RESTAURANT_ID"),
CONSTRAINT "UNIQUE_SERVICE" UNIQUE ("RESTAURANT_ID")
);
""")
cur.execute('COMMIT;')
end_time = math.trunc(time.time())
print("Table ODS.EXT_FB_RESTAURANT_SERVICE created in {execute_time} seconds.".format(execute_time=end_time-start_time))
if(check_if_table_exists('EDW','DWH_FB_MENU')):
print('Table EDW.DWH_FB_MENU already exists.')
else:
start_time = math.trunc(time.time())
cur.execute("""
CREATE TABLE "EDW"."DWH_FB_MENU"
(
"PRODUCT_ID" text NOT NULL,
"RESTAURANT_ID" text,
"CATEGORY_NAME" text,
"PRODUCT_NAME" text,
"PRODUCT_DESCRIPTION" text,
"PRODUCT_LISTED_PRICE" numeric,
"PRODUCT_PRICE" numeric,
"DISCOUNT" boolean,
"DESIGN_TYPE" text,
"DATE" date,
CONSTRAINT "PRODUCT_ID" UNIQUE ("PRODUCT_ID")
);
""")
cur.execute('COMMIT;')
end_time = math.trunc(time.time())
print("Table EDW.DWH_FB_MENU created in {execute_time} seconds.".format(execute_time=end_time-start_time))
if(check_if_table_exists('EDW','DWH_FB_COMMENT')):
print('Table EDW.DWH_FB_COMMENT already exists.')
else:
start_time = math.trunc(time.time())
cur.execute("""
CREATE TABLE "EDW"."DWH_FB_COMMENT"
(
"RESTAURANT_ID" text,
"USERNAME" text,
"COMMENT_TEXT" text,
"COMMENT_DATE" date,
"SPEED" integer,
"SERVING" integer,
"FLAVOUR" integer,
"DATE" date,
CONSTRAINT "UNIQUE_COMMENTS" UNIQUE ("RESTAURANT_ID", "USERNAME", "COMMENT_TEXT")
);
""")
cur.execute('COMMIT;')
end_time = math.trunc(time.time())
print("Table EDW.DWH_FB_COMMENT created in {execute_time} seconds.".format(execute_time=end_time-start_time))
if(check_if_table_exists('EDW','DWH_FB_RESTAURANT_SERVICE')):
print('Table EDW.DWH_FB_RESTAURANT_SERVICE already exists.')
else:
start_time = math.trunc(time.time())
cur.execute("""
CREATE TABLE "EDW"."DWH_FB_RESTAURANT_SERVICE"
(
"RESTAURANT_ID" text,
"MINIMUM_AMOUNT" numeric,
"MINIMUM_SERVICE_TIME" integer,
"MAXIMUM_SERVICE_TIME" integer,
"DATE" date,
CONSTRAINT "UNIQUE_SERVICE" PRIMARY KEY ("RESTAURANT_ID")
);
""")
cur.execute('COMMIT;')
end_time = math.trunc(time.time())
print("Table EDW.DWH_FB_RESTAURANT_SERVICE created in {execute_time} seconds.".format(execute_time=end_time-start_time))
if(check_if_table_exists('EDW','DWH_FB_RESTAURANT')):
print('Table EDW.DWH_FB_RESTAURANT already exists.')
else:
start_time = math.trunc(time.time())
cur.execute("""
CREATE TABLE "EDW"."DWH_FB_RESTAURANT"
(
"RESTAURANT_ID" text,
"RESTAURANT_NAME" text,
"RESTAURANT_BRAND" text,
"RESTAURANT_DISTRICT" text,
"RESTAURANT_STREET" text,
"RESTAURANT_LINK" text,
"DATE" date,
CONSTRAINT "UNIQUE_RESTAURANT" UNIQUE ("RESTAURANT_ID")
);
""")
cur.execute('COMMIT;')
end_time = math.trunc(time.time())
print("Table EDW.DWH_FB_RESTAURANT created in {execute_time} seconds.".format(execute_time=end_time-start_time))
cur.execute("""
CREATE OR REPLACE FUNCTION public.try_cast(_in text, INOUT _out anyelement)
LANGUAGE 'plpgsql'
AS $BODY$
BEGIN
EXECUTE format('SELECT %L::%s', $1, pg_typeof(_out))
INTO _out;
EXCEPTION WHEN others THEN
-- do nothing: _out already carries default
END
$BODY$;
""")
cur.execute('COMMIT;')
Table ODS.EXT_FB_RESTAURANT already exists. Table ODS.EXT_FB_MENU already exists. Table ODS.EXT_FB_COMMENT already exists. Table ODS.EXT_FB_PRODUCT_IMAGE already exists. Table ODS.EXT_FB_PRODUCT_IMAGE_SOURCE already exists. Table ODS.EXT_FB_RESTAURANT_SERVICE already exists. Table EDW.DWH_FB_MENU already exists. Table EDW.DWH_FB_COMMENT already exists. Table EDW.DWH_FB_RESTAURANT_SERVICE already exists. Table EDW.DWH_FB_RESTAURANT already exists.
Creating tables that will contain our data and creating database functions.
cur.execute("""
WITH DATES AS(
SELECT
MAX("DATE") AS "DATE"
FROM "ODS"."EXT_FB_MENU" EFM
UNION ALL
SELECT
MAX("DATE") AS "DATE"
FROM "ODS"."EXT_FB_COMMENT" EFC
)
SELECT
MAX("DATE") AS "LAST_EXECUTION_DATE"
FROM DATES;
""")
last_execution_date = cur.fetchone()[0]
last_execution_date
datetime.date(2021, 6, 11)
Getting last execution dates in the tables to not import the same data we collected for performance.
restaurant_list = []
end_date = min(date(2021,6,11),(date.today() - timedelta(days=1)))
driver = webdriver.Chrome(options=options, executable_path=selenium_chrome_driver_path)
if(last_execution_date < end_date):
driver.get(city_link)
time.sleep(5)
for i in range(25):
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2)
city_restaurant_groups = driver.find_elements_by_class_name("restaurant-main-info")
for restaurant in city_restaurant_groups:
restaurant_name = restaurant.find_element_by_class_name("restaurant-display-name").text
restaurant_name = restaurant_name.replace("YENİ ", "")
restaurant_link = restaurant.find_element_by_class_name("restaurant-display-name").find_element_by_xpath(".//a").get_attribute('href')
restaurant_id = restaurant_link.split("/")[-1]
if(len(restaurant_link) < 2):
continue
restaurant_list.append([restaurant_id,restaurant_name,restaurant_link])
restaurant_df = pd.DataFrame(restaurant_list, columns=["RESTAURANT_ID","RESTAURANT_NAME","RESTAURANT_LINK"])
df_column_conversation(restaurant_df, 'RESTAURANT_ID', 'text')
df_column_conversation(restaurant_df, 'RESTAURANT_NAME', 'text')
df_column_conversation(restaurant_df, 'RESTAURANT_LINK', 'text')
restaurant_df['DATE'] = "'"+ datetime.strftime(date.today(), "%Y-%m-%d") + "'::date"
execute_mogrify(conn,restaurant_df,"ODS","EXT_FB_RESTAURANT")
Scraping the website to gather restaurant lists and their links, in this part we need to scroll down to load all the restaurants.
sql_command = """
SELECT
"RESTAURANT_ID"
FROM "ODS"."EXT_FB_RESTAURANT" EFR
WHERE 1=1
AND NOT EXISTS(SELECT NULL FROM "ODS"."EXT_FB_MENU" EFM WHERE EFR."RESTAURANT_ID" = EFM."RESTAURANT_ID");
"""
restaurant_df = pd.read_sql(sql_command,conn)
if(last_execution_date < end_date):
for i in range(len(restaurant_df)):
sublink = restaurant_df.loc[i,"RESTAURANT_ID"]
restaurant_link = "{website}/{sublink}".format(website=website,sublink=sublink)
driver.get(restaurant_link)
time.sleep(5)
try:
if("sipariş verebilirsiniz." in driver.find_element_by_xpath('//*[@id="restaurantDetail"]/div/div[2]/h3').text):
continue
else:
pass
except Exception:
pass
menu = driver.find_element_by_xpath('//*[@id="restaurant_menu"]')
categories = menu.find_elements_by_xpath('//*[contains(@id,"menu_")]')
menu_list = []
for category in categories:
category_name = category.find_element_by_xpath(".//b").text
for product in category.find_elements_by_xpath(".//div[2]/ul/li"):
try:
design_type = "list"
try:
product_id = product.find_elements_by_class_name("getProductDetail")[-1].get_attribute('data-product-id')
product_name = product.find_elements_by_class_name("getProductDetail")[-1].text
except:
product_id = product.find_element_by_xpath(".//strong").get_attribute('data-product-id')
product_name = product.find_element_by_xpath(".//strong").text
design_type = "card"
try:
product_description = product.find_element_by_class_name("product-desc").text
product_price = product.find_element_by_class_name("price").text
except:
product_description = product.find_element_by_class_name("productInfo").text
product_price = product.find_element_by_class_name("newPrice").text
if(not(design_type=="card")):
design_type = "box"
discount = "TRUE"
try:
if(design_type=="list"):
product_listed_price = product.find_element_by_class_name("listed-price").text
if(design_type in ["card","box"]):
product_listed_price = product.find_element_by_class_name("listedPrice").text
except:
product_listed_price = product_price
discount = "FALSE"
menu_list.append([product_id,sublink,category_name,product_name,product_description,product_listed_price,product_price,discount,design_type])
except:
continue
menu_df = pd.DataFrame(menu_list, columns=["PRODUCT_ID","RESTAURANT_ID","CATEGORY_NAME","PRODUCT_NAME","PRODUCT_DESCRIPTION","PRODUCT_LISTED_PRICE","PRODUCT_PRICE","DISCOUNT","DESIGN_TYPE"])
menu_df = menu_df[menu_df['PRODUCT_ID'].str.len() > 0]
menu_df = menu_df[menu_df['PRODUCT_NAME'].str.len() > 0]
df_column_conversation(menu_df, 'PRODUCT_ID', 'text')
df_column_conversation(menu_df, 'RESTAURANT_ID', 'text')
df_column_conversation(menu_df, 'CATEGORY_NAME', 'text')
df_column_conversation(menu_df, 'PRODUCT_NAME', 'text')
df_column_conversation(menu_df, 'PRODUCT_DESCRIPTION', 'text')
df_column_conversation(menu_df, 'PRODUCT_LISTED_PRICE', 'text')
df_column_conversation(menu_df, 'PRODUCT_PRICE', 'text')
df_column_conversation(menu_df, 'DESIGN_TYPE', 'text')
menu_df['DATE'] = "'"+ datetime.strftime(date.today(), "%Y-%m-%d") + "'::date"
execute_mogrify(conn,menu_df,"ODS","EXT_FB_MENU")
menu_df
Looping through all of the restaurants and scraping their products.
sql_command = """
SELECT
"RESTAURANT_ID"
FROM "ODS"."EXT_FB_RESTAURANT" EFR
WHERE 1=1
AND NOT EXISTS(SELECT NULL FROM "ODS"."EXT_FB_COMMENT" EFC WHERE EFR."RESTAURANT_ID" = EFC."RESTAURANT_ID");
"""
restaurant_df = pd.read_sql(sql_command,conn)
restaurant_df = restaurant_df.sample(frac=1)
restaurant_df
| RESTAURANT_ID | |
|---|---|
| 58 | ghost-pizza-sisli-tesvikiye-mah-istanbul |
| 29 | tam-tadinda-tantuni-basaksehir-basaksehir-mah-... |
| 46 | mayk-cafe-sisli-fulya-mah-istanbul |
| 9 | fonseca-pizza-besiktas-dikilitas-mah-istanbul |
| 30 | big-slice-pizza-maltepe-baglarbasi-mah-istanbul |
| ... | ... |
| 56 | kurucesme-kahvesi-atasehir-barbaros-mah-istanbul |
| 65 | gardenia-burger-pizza-makarna-sariyer-maslak-m... |
| 40 | nisantasi-22-sisli-harbiye-mah-istanbul |
| 54 | ellisse-beyoglu-omeravni-mah-kabatas-istanbul |
| 71 | caffe-di-dosso-dossi-fatih-aksemsettin-mah-ist... |
79 rows × 1 columns
if(last_execution_date < end_date):
for i in range(len(restaurant_df)):
sublink = restaurant_df.loc[i,"RESTAURANT_ID"]
last_comment_page_url = "{website}/{sublink}?section=comments&page=9999".format(website=website,sublink=sublink)
driver.get(last_comment_page_url)
time.sleep(0.1)
comments_list = []
if(sublink not in driver.current_url):
continue
last_comment_page_redirect_url = driver.current_url
last_comment_page_number = int(last_comment_page_redirect_url.replace("&status=closed","").replace("{website}/{sublink}?section=comments&page=".format(website=website,sublink=sublink),""))
for page_number in range(1, last_comment_page_number+1):
current_comment_page_url = "{website}/{sublink}?section=comments&page={page_number}".format(website=website,sublink=sublink,page_number=page_number)
driver.get(current_comment_page_url)
time.sleep(1)
try:
if("sipariş verebilirsiniz." in driver.find_element_by_xpath('//*[@id="restaurantDetail"]/div/div[2]/h3').text):
continue
else:
pass
except Exception:
pass
try:
driver.find_element(By.XPATH, '//*[@id="alternative-restaurant-popup"]/div[1]/div[2]/img').click(); #Closing pop-up
except Exception:
pass
#driver.find_element(By.XPATH, '//*[@id="restaurantDetail"]/div[2]/div[1]/ul/li[4]/a').click(); #Clicking comments
comment_list = driver.find_elements_by_class_name("comments-body")
for comment in comment_list:
try:
username = comment.find_element_by_class_name("userName").text
comment_text = comment.find_element_by_xpath('.//p').text
comment_date = comment.find_element_by_class_name("commentDate").text
except NoSuchElementException:
continue
try:
speed = comment.find_element_by_class_name("speed").text
except NoSuchElementException:
speed = ""
try:
serving = comment.find_element_by_class_name("serving").text
except NoSuchElementException:
serving = ""
try:
flavour = comment.find_element_by_class_name("flavour").text
except NoSuchElementException:
flavour = ""
comments_list.append([sublink, username, comment_text, comment_date, speed, serving, flavour])
comment_df = pd.DataFrame(comments_list, columns=["RESTAURANT_ID","USERNAME","COMMENT_TEXT","COMMENT_DATE","SPEED","SERVING","FLAVOUR"])
df_column_conversation(comment_df, 'RESTAURANT_ID', 'text')
df_column_conversation(comment_df, 'USERNAME', 'text')
df_column_conversation(comment_df, 'COMMENT_TEXT', 'text')
df_column_conversation(comment_df, 'COMMENT_DATE', 'text')
df_column_conversation(comment_df, 'SPEED', 'text')
df_column_conversation(comment_df, 'SERVING', 'text')
df_column_conversation(comment_df, 'FLAVOUR', 'text')
comment_df['DATE'] = "'"+ datetime.strftime(date.today(), "%Y-%m-%d") + "'::date"
execute_mogrify(conn,comment_df,"ODS","EXT_FB_COMMENT")
comment_df
Looping through all of the restaurants and scraping their reviews and given scores for each comment.
sql_command = """
SELECT DISTINCT
"RESTAURANT_ID",
"DESIGN_TYPE"
FROM "ODS"."EXT_FB_MENU" EFM
WHERE 1=1
AND EFM."DESIGN_TYPE" <> 'list'
AND NOT EXISTS(SELECT NULL FROM "ODS"."EXT_FB_PRODUCT_IMAGE" EFPI WHERE EFM."RESTAURANT_ID" = EFPI."RESTAURANT_ID");
"""
restaurant_df = pd.read_sql(sql_command,conn)
restaurant_df
| RESTAURANT_ID | DESIGN_TYPE | |
|---|---|---|
| 0 | the-boss-pizza-burger-sariyer-maden-mah-istanbul | card |
| 1 | 444-pizza-bagcilar-demirkapi-mah-istanbul | card |
| 2 | mezzaluna-paket-beykoz-acarlar-mah-acarkent-is... | card |
| 3 | kafein-plus-kucukcekmece-tevfikbey-mah-sefakoy... | card |
| 4 | mayk-cafe-sisli-fulya-mah-istanbul | card |
| ... | ... | ... |
| 208 | amigos-burger-pizza-kagithane-merkez-mah-istanbul | card |
| 209 | konak-firin-avcilar-merkez-mah-istanbul | card |
| 210 | keyifle-pide-lahmacun-bakirkoy-atakoy-6-kisim-... | card |
| 211 | havelka-kadikoy-caddebostan-mah-istanbul | card |
| 212 | heros-pizza-esenyurt-fatih-mah-istanbul | card |
213 rows × 2 columns
if(last_execution_date < end_date):
for i in range(len(restaurant_df)):
sublink = restaurant_df.loc[i,"RESTAURANT_ID"]
design_type = restaurant_df.loc[i,"DESIGN_TYPE"]
page_url = "{website}/{sublink}".format(website=website,sublink=sublink)
driver.get(page_url)
time.sleep(0.1)
webpage_scroll_down(driver, 15)
if(sublink not in driver.current_url):
continue
try:
if("sipariş verebilirsiniz." in driver.find_element_by_xpath('//*[@id="restaurantDetail"]/div/div[2]/h3').text):
continue
else:
pass
except Exception:
pass
try:
driver.find_element(By.XPATH, '//*[@id="alternative-restaurant-popup"]/div[1]/div[2]/img').click(); #Closing pop-up
except Exception:
pass
menu = driver.find_element_by_xpath('//*[@id="restaurant_menu"]')
categories = menu.find_elements_by_xpath('//*[contains(@id,"menu_")]')
image_list = []
for product in menu.find_elements_by_class_name("product-image"):
try:
product_id = product.get_attribute('data-product-id')
image_link = product.find_element_by_xpath(".//img").get_attribute('src')
if("www.yemeksepeti.com/assets/images/" in image_link):
continue
image_list.append([product_id,sublink,image_link])
except:
pass
image_df = pd.DataFrame(image_list, columns=["PRODUCT_ID","RESTAURANT_ID","IMAGE_LINK"])
df_column_conversation(image_df, 'PRODUCT_ID', 'text')
df_column_conversation(image_df, 'RESTAURANT_ID', 'text')
df_column_conversation(image_df, 'IMAGE_LINK', 'text')
image_df['DATE'] = "'"+ datetime.strftime(date.today(), "%Y-%m-%d") + "'::date"
execute_mogrify(conn,image_df,"ODS","EXT_FB_PRODUCT_IMAGE")
sql_command = """
SELECT DISTINCT
"RESTAURANT_ID"
FROM "ODS"."EXT_FB_MENU" EFM
WHERE 1=1
AND NOT EXISTS(SELECT NULL FROM "ODS"."EXT_FB_RESTAURANT_SERVICE" EFRS WHERE EFM."RESTAURANT_ID" = EFRS."RESTAURANT_ID");
"""
restaurant_df = pd.read_sql(sql_command,conn)
restaurant_df
| RESTAURANT_ID | |
|---|---|
| 0 | adrians-pizza-sultanbeyli-mehmet-akif-mah-ista... |
| 1 | afili-pizza-kadikoy-rasimpasa-mah-istanbul |
| 2 | antica-pizzeria-by-antica-locanda-besiktas-arn... |
| 3 | aron-pizza-atasehir-barbaros-mah-istanbul |
| 4 | ayicik-pizza-tatli-besiktas-dikilitas-mah-ista... |
| 5 | baca-pizzeria-sariyer-maden-mah-istanbul |
| 6 | balcoon-cafe-uskudar-kisikli-mah-istanbul |
| 7 | bee-pizza-esenler-davutpasa-mah-istanbul |
| 8 | big-slice-pizza-maltepe-baglarbasi-mah-istanbul |
| 9 | duses-pizza-esenyurt-ucevler-mah-istanbul |
| 10 | emporio-pizza-pasta-besiktas-levent-mah-istanbul |
| 11 | f-chef-italiano-pizza-pasta-sisli-halaskargazi... |
| 12 | hamurkeyf-fatih-cerrahpasa-mah-istanbul |
| 13 | heros-pizza-besiktas-dikilitas-mah-istanbul |
| 14 | little-caesars-pizza-bayrampasa-yenidogan-mah-... |
| 15 | little-caesars-pizza-fatih-aksaray-mah-istanbul |
| 16 | little-caesars-pizza-sariyer-maden-mah-istanbul |
| 17 | mandalin-mercan-tuzla-postane-mah-istanbul |
| 18 | mistic-pizza-burger-cafe-esenler-havaalani-mah... |
| 19 | panda-pizza-esenyurt-guzelyurt-mah-istanbul |
| 20 | pastavicio-makarna-pizza-kadikoy-caferaga-mah-... |
| 21 | pizza-bulls-sultangazi-ugur-mumcu-mah-istanbul |
| 22 | pizza-hays-cafe-pendik-velibaba-mah-istanbul |
| 23 | pizza-hut-atasehir-kucukbakkalkoy-mah-istanbul |
| 24 | pizza-hut-besiktas-levazim-mah-istanbul |
| 25 | pizza-hut-pendik-yenisehir-mah-kurtkoy-istanbul |
| 26 | pizza-hut-sariyer-ayazaga-mah-vadi-istanbul-bo... |
| 27 | pizza-hut-sisli-esentepe-mah-ozdilek-istanbul |
| 28 | pizza-station-kagithane-seyrantepe-mah-istanbul |
| 29 | pizzeria-eyup-gokturk-merkez-mah-istanbul |
| 30 | sbarro-pizza-atasehir-kucukbakkalkoy-mah-istanbul |
| 31 | sbarro-pizza-bakirkoy-osmaniye-mah-istanbul |
| 32 | sbarro-pizza-basaksehir-ziya-gokalp-mah-istanbul |
| 33 | sbarro-pizza-esenyurt-guzelyurt-mah-istanbul |
| 34 | sbarro-pizza-esenyurt-incirtepe-mah-istanbul |
| 35 | sbarro-pizza-esenyurt-saadetdere-mah-istanbul |
| 36 | sbarro-pizza-eyup-aksemsettin-mah-istanbul |
| 37 | sbarro-pizza-eyup-topcular-mah-istanbul |
| 38 | sbarro-pizza-pendik-guzelyali-mah-istanbul |
| 39 | sbarro-pizza-sisli-esentepe-mah-plazalar-istanbul |
| 40 | sbarro-pizza-umraniye-madenler-mah-istanbul |
| 41 | sbarro-pizza-umraniye-yamanevler-mah-istanbul |
| 42 | sbarro-pizza-uskudar-mimar-sinan-mah-istanbul |
| 43 | sehremini-tas-firini-fatih-sehremini-mah-capa-... |
| 44 | sembol-pasta-pide-pizza-esenler-menderes-mah-i... |
| 45 | torcello-ristorante-italiano-besiktas-akat-mah... |
#if(last_execution_date < end_date):
if(last_execution_date < end_date):
for i in range(len(restaurant_df)):
sublink = restaurant_df.loc[i,"RESTAURANT_ID"]
restaurant_link = "{website}/{sublink}".format(website=website,sublink=sublink)
driver.get(restaurant_link)
time.sleep(5)
try:
if("sipariş verebilirsiniz." in driver.find_element_by_xpath('//*[@id="restaurantDetail"]/div/div[2]/h3').text):
continue
else:
pass
except Exception:
pass
minimum_amount = ""
service_time = ""
for shortInfoItem in driver.find_elements_by_class_name('shortInfoItem'):
if("Min. Tutar" in shortInfoItem.text):
minimum_amount = shortInfoItem.text.replace("Min. Tutar\n","").replace(" TL", "").replace(",",".")
if("Servis Süresi" in shortInfoItem.text):
service_time = shortInfoItem.text.replace("Servis Süresi\n","").replace(" dk.", "")
if(minimum_amount == "" and service_time == ""):
continue
service_df = pd.DataFrame([[sublink, minimum_amount, service_time]], columns=["RESTAURANT_ID","MINIMUM_AMOUNT","SERVICE_TIME"])
df_column_conversation(service_df, 'RESTAURANT_ID', 'text')
df_column_conversation(service_df, 'MINIMUM_AMOUNT', 'text')
df_column_conversation(service_df, 'SERVICE_TIME', 'text')
service_df['DATE'] = "'"+ datetime.strftime(date.today(), "%Y-%m-%d") + "'::date"
execute_mogrify(conn,service_df,"ODS","EXT_FB_RESTAURANT_SERVICE")
driver.quit()
sql_command = """
SELECT DISTINCT
"IMAGE_LINK"
FROM "ODS"."EXT_FB_PRODUCT_IMAGE" EFPI
WHERE 1=1
AND NOT EXISTS(SELECT NULL FROM "ODS"."EXT_FB_PRODUCT_IMAGE_SOURCE" EFPIS WHERE EFPIS."IMAGE_LINK" = EFPI."IMAGE_LINK");
"""
image_df = pd.read_sql(sql_command,conn)
image_df = image_df.sample(frac=1).reset_index(drop=True)
image_df
| IMAGE_LINK | |
|---|---|
| 0 | https://cdn.yemeksepeti.com/ProductImages/TR_I... |
for i in range(len(image_df)):
page_url = image_df.loc[i,"IMAGE_LINK"]
image_file_name = "images/product.jpg"
try:
urllib.request.urlretrieve(page_url, image_file_name)
except:
continue
image_binary = psycopg2.Binary(open(image_file_name, 'rb').read())
cur.execute("""
INSERT INTO "ODS"."EXT_FB_PRODUCT_IMAGE_SOURCE" ("IMAGE_LINK","SOURCE")
VALUES('{image_link}',{image_binary})
ON CONFLICT DO NOTHING;
""".format(image_link=page_url, image_binary=image_binary))
cur.execute('COMMIT;')
Looping through all of the restaurants and scraping their product images for deep learning purposes.
cur.execute("""
INSERT INTO "EDW"."DWH_FB_MENU"
SELECT
EFM."PRODUCT_ID",
EFM."RESTAURANT_ID",
EFM."CATEGORY_NAME",
EFM."PRODUCT_NAME",
EFM."PRODUCT_DESCRIPTION",
REPLACE(REPLACE(EFM."PRODUCT_LISTED_PRICE",' TL',''),',','.')::numeric AS "PRODUCT_LISTED_PRICE",
REPLACE(REPLACE(EFM."PRODUCT_PRICE",' TL',''),',','.')::numeric AS "PRODUCT_PRICE",
EFM."DISCOUNT",
EFM."DESIGN_TYPE",
EFM."DATE"
FROM "ODS"."EXT_FB_MENU" EFM
ON CONFLICT DO NOTHING;
""")
cur.execute('COMMIT;')
cur.execute("""
INSERT INTO "EDW"."DWH_FB_RESTAURANT_SERVICE"
SELECT
EFRS."RESTAURANT_ID",
EFRS."MINIMUM_AMOUNT"::numeric AS "MINIMUM_AMOUNT",
LEFT(EFRS."SERVICE_TIME", STRPOS(EFRS."SERVICE_TIME", '-')-1)::integer AS "MINIMUM_SERVICE_TIME",
RIGHT(EFRS."SERVICE_TIME", STRPOS(EFRS."SERVICE_TIME", '-')-1)::integer AS "MAXIMUM_SERVICE_TIME",
EFRS."DATE"
FROM "ODS"."EXT_FB_RESTAURANT_SERVICE" EFRS
ON CONFLICT DO NOTHING;
""")
cur.execute('COMMIT;')
cur.execute("""
INSERT INTO "EDW"."DWH_FB_RESTAURANT"
WITH STREET AS(
SELECT
EFR."RESTAURANT_ID",
EFR."RESTAURANT_NAME",
SUBSTRING(EFR."RESTAURANT_NAME", '\((.+)\)') AS "RESTAURANT_STREET",
EFR."RESTAURANT_LINK",
EFR."DATE"
FROM "ODS"."EXT_FB_RESTAURANT" EFR
)
SELECT
EFR."RESTAURANT_ID",
EFR."RESTAURANT_NAME",
SPLIT_PART(REPLACE(REPLACE(REPLACE(EFR."RESTAURANT_NAME",COALESCE(EFR."RESTAURANT_STREET",''),''),'(',''),')',''),',',1) AS "RESTAURANT_BRAND",
REVERSE(SPLIT_PART(REVERSE(REPLACE(REPLACE(REPLACE(EFR."RESTAURANT_NAME",COALESCE(EFR."RESTAURANT_STREET",''),''),'(',''),')','')),',',1)) AS "RESTAURANT_DISTRICT",
EFR."RESTAURANT_STREET",
EFR."RESTAURANT_LINK",
EFR."DATE"
FROM STREET EFR
ON CONFLICT DO NOTHING;
""")
cur.execute('COMMIT;')
if(last_execution_date < end_date):
sql_command = """
WITH CLEAN_DATA AS(
SELECT
EFC."RESTAURANT_ID",
EFC."USERNAME",
LOWER(EFC."COMMENT_TEXT") AS "COMMENT_TEXT",
EFC."COMMENT_DATE",
TRY_CAST(REGEXP_REPLACE(EFC."SPEED", '\D','','g'),NULL::INTEGER) AS "SPEED",
TRY_CAST(REGEXP_REPLACE(EFC."SERVING", '\D','','g'),NULL::INTEGER) AS "SERVING",
TRY_CAST(REGEXP_REPLACE(EFC."FLAVOUR", '\D','','g'),NULL::INTEGER) AS "FLAVOUR",
EFC."DATE",
REGEXP_REPLACE(EFC."COMMENT_DATE", '\D','','g')||' '||REPLACE(REPLACE(REPLACE(REGEXP_REPLACE(REPLACE(EFC."COMMENT_DATE",' önce',''), '[^[:alpha:]]', '', 'g'),'ay','month'),'bugün','today'),'gün','day') AS "COMMENT_DATE_INTERVAL"
FROM "ODS"."EXT_FB_COMMENT" EFC
WHERE 1=1
AND EFC."USERNAME" <> 'Yemeksepeti'
)
SELECT
CD."RESTAURANT_ID",
CD."USERNAME",
CD."COMMENT_TEXT",
CASE WHEN CD."COMMENT_DATE_INTERVAL" = ' today' THEN CD."DATE" ELSE CD."DATE" - CAST(CD."COMMENT_DATE_INTERVAL" AS INTERVAL) END::date AS "COMMENT_DATE",
CD."SPEED",
CD."SERVING",
CD."FLAVOUR",
CD."DATE"
FROM CLEAN_DATA CD;
"""
comment_df = pd.read_sql(sql_command,conn)
comment_df['COMMENT_TEXT'] = comment_df['COMMENT_TEXT'].apply(clean_text)
comment_df
df_column_conversation(comment_df, 'RESTAURANT_ID', 'text')
df_column_conversation(comment_df, 'USERNAME', 'text')
df_column_conversation(comment_df, 'COMMENT_TEXT', 'text')
df_column_conversation(comment_df, 'COMMENT_DATE', 'date')
df_column_conversation(comment_df, 'SPEED', 'integer')
df_column_conversation(comment_df, 'SERVING', 'integer')
df_column_conversation(comment_df, 'FLAVOUR', 'integer')
df_column_conversation(comment_df, 'DATE', 'date')
comment_df.replace('<NA>', 'NULL', inplace=True)
execute_mogrify(conn,comment_df,"EDW","DWH_FB_COMMENT")
sql_command = """
SELECT
*
FROM "EDW"."DWH_FB_COMMENT" EFR;
"""
comment_df = pd.read_sql(sql_command,conn)
comment_df
| RESTAURANT_ID | USERNAME | COMMENT_TEXT | COMMENT_DATE | SPEED | SERVING | FLAVOUR | DATE | |
|---|---|---|---|---|---|---|---|---|
| 0 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...o | herzamanki gibi mükemmel ellerinize saglik | 2021-05-26 | 10.0 | 10 | 10 | 2021-06-09 |
| 1 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...n | favori pizzacım genelde madness söylüyordum yo... | 2021-05-25 | 10.0 | 10 | 10 | 2021-06-09 |
| 2 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...t | şahane şimdi tekrarlıyorum siparişi | 2021-05-25 | 10.0 | 10 | 10 | 2021-06-09 |
| 3 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...n | hamur seçimi yapılamıyor | 2021-05-24 | 10.0 | 9 | 7 | 2021-06-09 |
| 4 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...1 | pizza kesilmeden gönderilmişti lütfen dikkat e... | 2021-05-24 | 10.0 | 8 | 10 | 2021-06-09 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 271838 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...5 | elinize sağlık çok başarılı | 2021-05-27 | 10.0 | 10 | 10 | 2021-06-09 |
| 271839 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...i | super | 2021-05-27 | 10.0 | 10 | 10 | 2021-06-09 |
| 271840 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...2 | pizzadan anlamayanların beğendiği bir pizza do... | 2021-05-27 | 10.0 | 5 | 5 | 2021-06-09 |
| 271841 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...n | cok guzeldi bu sefer tesekkurler | 2021-05-26 | 10.0 | 10 | 10 | 2021-06-09 |
| 271842 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...5 | hawaii pizza ve yarı yarıya pizzalar harika in... | 2021-05-26 | 10.0 | 10 | 10 | 2021-06-09 |
271843 rows × 8 columns
sql_command = """
WITH AVERAGE_SCORES AS(
SELECT
DFC."RESTAURANT_ID",
AVG(DFC."SPEED") AS "SPEED",
AVG(DFC."SERVING") AS "SERVING",
AVG(DFC."FLAVOUR") AS "FLAVOUR"
FROM "EDW"."DWH_FB_COMMENT" DFC
WHERE 1=1
AND DFC."SPEED" IS NOT NULL
GROUP BY DFC."RESTAURANT_ID"
HAVING COUNT(*) >= 30
)
SELECT
DFR."RESTAURANT_ID",
DFR."RESTAURANT_BRAND",
DFR."RESTAURANT_DISTRICT",
DFR."RESTAURANT_STREET",
DFRS."MINIMUM_AMOUNT",
DFRS."MINIMUM_SERVICE_TIME",
DFRS."MAXIMUM_SERVICE_TIME",
AVS."SPEED",
AVS."SERVING",
AVS."FLAVOUR"
FROM "EDW"."DWH_FB_RESTAURANT" DFR
INNER JOIN "EDW"."DWH_FB_RESTAURANT_SERVICE" DFRS ON (DFR."RESTAURANT_ID" = DFRS."RESTAURANT_ID")
INNER JOIN AVERAGE_SCORES AVS ON (DFR."RESTAURANT_ID" = AVS."RESTAURANT_ID");
"""
service_df = pd.read_sql(sql_command,conn)
service_df
| RESTAURANT_ID | RESTAURANT_BRAND | RESTAURANT_DISTRICT | RESTAURANT_STREET | MINIMUM_AMOUNT | MINIMUM_SERVICE_TIME | MAXIMUM_SERVICE_TIME | SPEED | SERVING | FLAVOUR | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100-homemade-pauls-pizza-lasagna-kadikoy-cafer... | %100 Homemade Pauls Pizza & Lasagna | Kadıköy | Caferağa Mah. - Moda | 0.0 | 20 | 30 | 9.836364 | 9.890909 | 9.563636 |
| 1 | 1991-tarihi-ortakoy-kumpircisi-silivri-yeni-ma... | 1991 Tarihi Ortaköy Kumpircisi | Silivri | Yeni Mah. | 30.0 | 20 | 30 | 8.803279 | 8.606557 | 8.606557 |
| 2 | 400-c-pizza-sisli-tesvikiye-mah-istanbul | 400° C Pizza | Şişli | Teşvikiye Mah. | 0.0 | 35 | 45 | 8.837607 | 8.350427 | 7.957265 |
| 3 | 400-c-pizza-umraniye-serifali-mah-istanbul | 400° C Pizza | Ümraniye | Şerifali Mah. | 5.0 | 30 | 40 | 8.398438 | 7.781250 | 8.164062 |
| 4 | 444-pizza-bagcilar-demirkapi-mah-istanbul | 444 Pizza | Bağcılar | Demirkapı Mah. | 0.0 | 35 | 45 | 7.392070 | 6.885463 | 6.986784 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | yummy-pizza-pendik-esenyali-mah-istanbul | Yummy Pizza | Pendik | Esenyalı Mah. | 5.0 | 50 | 60 | 8.113636 | 7.931818 | 7.636364 |
| 887 | yuz-seksen-gram-pizza-umraniye-ataturk-mah-ist... | Yüz Seksen Gram Pizza | Ümraniye | Atatürk Mah. | 5.0 | 20 | 30 | 9.874286 | 9.720000 | 9.765714 |
| 888 | zi-burger-pizza-basaksehir-bahcesehir-2-kisim-... | Zi Burger & Pizza | Başakşehir | Bahçeşehir 2. Kısım Mah. | 35.0 | 35 | 45 | 8.704918 | 8.540984 | 8.557377 |
| 889 | ziyadem-kebap-bahcelievler-kocasinan-mah-istanbul | Ziyadem Kebap | Bahçelievler | Kocasinan Mah. | 15.0 | 30 | 40 | 8.250000 | 7.640625 | 7.632812 |
| 890 | zucca-pizza-sisli-bozkurt-mah-kurtulus-istanbul | Zucca Pizza | Şişli | Bozkurt Mah. - Kurtuluş | 5.0 | 20 | 30 | 9.400000 | 8.996078 | 8.698039 |
891 rows × 10 columns
stop_words = [element for element in stopwords.words('turkish') if element not in ['çok','eğer','gibi','hiç','niçin','niye','sanki','yani','en','az','birkaç','bazı','aslında','neden','hepsi']]
comment_df['COMMENT_TEXT'] = comment_df['COMMENT_TEXT'].apply(lambda x: ' '.join([word for word in x.lower().split() if word not in (stop_words)]))
#comment_df['COMMENT_TEXT'] = comment_df['COMMENT_TEXT'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
comment_df
| RESTAURANT_ID | USERNAME | COMMENT_TEXT | COMMENT_DATE | SPEED | SERVING | FLAVOUR | DATE | |
|---|---|---|---|---|---|---|---|---|
| 0 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...o | herzamanki gibi mükemmel ellerinize saglik | 2021-05-26 | 10.0 | 10 | 10 | 2021-06-09 |
| 1 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...n | favori pizzacım genelde madness söylüyordum yo... | 2021-05-25 | 10.0 | 10 | 10 | 2021-06-09 |
| 2 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...t | şahane şimdi tekrarlıyorum siparişi | 2021-05-25 | 10.0 | 10 | 10 | 2021-06-09 |
| 3 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...n | hamur seçimi yapılamıyor | 2021-05-24 | 10.0 | 9 | 7 | 2021-06-09 |
| 4 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...1 | pizza kesilmeden gönderilmişti lütfen dikkat e... | 2021-05-24 | 10.0 | 8 | 10 | 2021-06-09 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 271838 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...5 | elinize sağlık çok başarılı | 2021-05-27 | 10.0 | 10 | 10 | 2021-06-09 |
| 271839 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...i | super | 2021-05-27 | 10.0 | 10 | 10 | 2021-06-09 |
| 271840 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...2 | pizzadan anlamayanların beğendiği bir pizza do... | 2021-05-27 | 10.0 | 5 | 5 | 2021-06-09 |
| 271841 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...n | cok guzeldi sefer tesekkurler | 2021-05-26 | 10.0 | 10 | 10 | 2021-06-09 |
| 271842 | pizza-hot-slice-cekmekoy-tasdelen-mah-istanbul | ...5 | hawaii pizza yarı yarıya pizzalar harika incec... | 2021-05-26 | 10.0 | 10 | 10 | 2021-06-09 |
271843 rows × 8 columns
Cleaning our review dataset with lowercasing all the text, removing punctuations, removing some Turkish stopwords, and estimating when the reviews are made, we are also transforming string scores to the integer type.
analyze_data('eda_restaurant.html',False,'ODS','EXT_FB_RESTAURANT')
analyze_data('eda_menu.html',False,'EDW','DWH_FB_MENU')
analyze_data('eda_product_image.html',False,'ODS','EXT_FB_PRODUCT_IMAGE')
analyze_data('eda_comment.html',False,'EDW','DWH_FB_COMMENT')
analyze_dataframe('eda_service.html',False,service_df)
Done! Use 'show' commands to display/save. |██████████| [100%] 00:00 -> (00:00 left)
Report eda_restaurant.html was generated. You can see the exploratory data analysis visualizations in your browser.
Done! Use 'show' commands to display/save. |██████████| [100%] 00:00 -> (00:00 left)
Report eda_menu.html was generated. You can see the exploratory data analysis visualizations in your browser.
Done! Use 'show' commands to display/save. |██████████| [100%] 00:00 -> (00:00 left)
Report eda_product_image.html was generated. You can see the exploratory data analysis visualizations in your browser.
Done! Use 'show' commands to display/save. |██████████| [100%] 00:00 -> (00:00 left)
Report eda_comment.html was generated.
Feature: RESTAURANT_ID |▉ | [ 9%] 00:00 -> (00:00 left)
You can see the exploratory data analysis visualizations in your browser.
Done! Use 'show' commands to display/save. |██████████| [100%] 00:00 -> (00:00 left)
Report eda_service.html was generated. You can see the exploratory data analysis visualizations in your browser.
We have 1,189 distinct resturants, 131,131 distinct products with 1,077 distinct categories and 23,109 dstinct names only 11% of the products are discounted.
Our product prices are averaging around 41 with 30.4 standart deviation. We have around 5,112 product images.
In user reviews we have 271,843 comments which 221,312 of them are distinct in user scores most of the user scores are 10 with around 56-64% of the scores and the second most common score is 1 with around 10-13% of the scores.
Dataset does not contain any missing data except in user speed scores which contains less than 1% of the data with 3,495 rows.
if(not(check_if_file_exists('images/word_cloud_popular_words.png'))):
sql_command = """
WITH WORDS AS(
SELECT
regexp_split_to_table(regexp_replace("COMMENT_TEXT", '[^[:alpha:]]', '', 'g'), '\s+(\w\s+)*') AS "WORD"
FROM "EDW"."DWH_FB_COMMENT"
)
SELECT
"WORD",
COUNT(*) AS "WORD_COUNT"
FROM WORDS
WHERE LENGTH("WORD") > 1
GROUP BY WORDS."WORD"
ORDER BY COUNT(*) DESC
LIMIT 1000;
"""
df_word_cloud = pd.read_sql(sql_command,conn)
df_word_cloud['WORD'] = df_word_cloud['WORD'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
df_word_cloud['WORD'] = df_word_cloud['WORD'].apply(turkish_to_english_characters)
df_word_cloud.groupby(['WORD'], as_index=False)['WORD_COUNT'].sum()
word_cloud_data = dict(zip(df_word_cloud['WORD'].tolist(), df_word_cloud['WORD_COUNT'].tolist()))
wc = WordCloud(background_color='white', width=8000, height=4000, max_words=10000).generate_from_frequencies(word_cloud_data)
wc.to_file('images/word_cloud_popular_words.png')
Image.open('images/word_cloud_popular_words.png')
We are going to transform our reviews with CountVectorizer, process is similar to one-hot encoding which counts how many of these words appear in a label.
for label in ['SPEED','SERVING','FLAVOUR']:
model = CountVectorizer(min_df=3)
not_null_df = comment_df[0:10000][pd.notnull(comment_df[label])]
features = model.fit_transform(not_null_df['COMMENT_TEXT'].values).todense()
labels = not_null_df[label].values
print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
#print(np.shape(labels))
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/BoW_CountVectorizer_GaussianNB_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = GaussianNB()
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Accuracy Score for {label}: {accuracy_score:0.2f}%".format(accuracy_score = accuracy_score(labels_test, label_prediction)*100, label=label))
print("F1 Score for {label}: {f1_score:0.2f}%".format(f1_score = f1_score(labels_test, label_prediction, average='micro')*100, label=label))
print("Precision Score for {label}: {precision_score:0.2f}%\n".format(precision_score = precision_score(labels_test, label_prediction, average='micro')*100, label=label))
Total Features after vectorizing: 3526 Accuracy Score for SPEED: 33.73% F1 Score for SPEED: 33.73% Precision Score for SPEED: 33.73% Total Features after vectorizing: 3526 Accuracy Score for SERVING: 29.20% F1 Score for SERVING: 29.20% Precision Score for SERVING: 29.20% Total Features after vectorizing: 3526 Accuracy Score for FLAVOUR: 28.80% F1 Score for FLAVOUR: 28.80% Precision Score for FLAVOUR: 28.80%
for label in ['SPEED','SERVING','FLAVOUR']:
model = CountVectorizer(min_df=3)
not_null_df = comment_df[0:10000][pd.notnull(comment_df[label])]
features = model.fit_transform(not_null_df['COMMENT_TEXT'].values).todense()
labels = not_null_df[label].values
print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/BoW_CountVectorizer_SVC_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = SVC()
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Accuracy Score for {label}: {accuracy_score:0.2f}%".format(accuracy_score = accuracy_score(labels_test, label_prediction)*100, label=label))
print("F1 Score for {label}: {f1_score:0.2f}%".format(f1_score = f1_score(labels_test, label_prediction, average='micro')*100, label=label))
print("Precision Score for {label}: {precision_score:0.2f}%\n".format(precision_score = precision_score(labels_test, label_prediction, average='micro')*100, label=label))
Total Features after vectorizing: 3526 Accuracy Score for SPEED: 71.87% F1 Score for SPEED: 71.87% Precision Score for SPEED: 71.87% Total Features after vectorizing: 3526 Accuracy Score for SERVING: 67.30% F1 Score for SERVING: 67.30% Precision Score for SERVING: 67.30% Total Features after vectorizing: 3526 Accuracy Score for FLAVOUR: 64.90% F1 Score for FLAVOUR: 64.90% Precision Score for FLAVOUR: 64.90%
for label in ['SPEED','SERVING','FLAVOUR']:
model = CountVectorizer(min_df=3)
not_null_df = comment_df[0:10000][pd.notnull(comment_df[label])]
features = model.fit_transform(not_null_df['COMMENT_TEXT'].values).todense()
labels = not_null_df[label].values
print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/BoW_CountVectorizer_LinearSVC_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = LinearSVC()
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Accuracy Score for {label}: {accuracy_score:0.2f}%".format(accuracy_score = accuracy_score(labels_test, label_prediction)*100, label=label))
print("F1 Score for {label}: {f1_score:0.2f}%".format(f1_score = f1_score(labels_test, label_prediction, average='micro')*100, label=label))
print("Precision Score for {label}: {precision_score:0.2f}%\n".format(precision_score = precision_score(labels_test, label_prediction, average='micro')*100, label=label))
Total Features after vectorizing: 3526 Accuracy Score for SPEED: 68.13% F1 Score for SPEED: 68.13% Precision Score for SPEED: 68.13% Total Features after vectorizing: 3526 Accuracy Score for SERVING: 64.20% F1 Score for SERVING: 64.20% Precision Score for SERVING: 64.20% Total Features after vectorizing: 3526 Accuracy Score for FLAVOUR: 63.60% F1 Score for FLAVOUR: 63.60% Precision Score for FLAVOUR: 63.60%
for label in ['SPEED','SERVING','FLAVOUR']:
model = CountVectorizer(min_df=3)
not_null_df = comment_df[0:10000][pd.notnull(comment_df[label])]
features = model.fit_transform(not_null_df['COMMENT_TEXT'].values).todense()
labels = not_null_df[label].values
print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/BoW_CountVectorizer_DecisionTreeClassifier_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
params = {
'max_depth': [10,13,14,15],
'min_samples_split': [2,3,4]
}
gscv = GridSearchCV(DecisionTreeClassifier(), params, cv=5)
gscv.fit(features_train, labels_train)
print("Best parameters: {best_parameters}".format(best_parameters = gscv.best_params_))
model = gscv.best_estimator_
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Accuracy Score for {label}: {accuracy_score:0.2f}%".format(accuracy_score = accuracy_score(labels_test, label_prediction)*100, label=label))
print("F1 Score for {label}: {f1_score:0.2f}%".format(f1_score = f1_score(labels_test, label_prediction, average='micro')*100, label=label))
print("Precision Score for {label}: {precision_score:0.2f}%\n".format(precision_score = precision_score(labels_test, label_prediction, average='micro')*100, label=label))
Total Features after vectorizing: 3526 Accuracy Score for SPEED: 72.20% F1 Score for SPEED: 72.20% Precision Score for SPEED: 72.20% Total Features after vectorizing: 3526 Accuracy Score for SERVING: 66.83% F1 Score for SERVING: 66.83% Precision Score for SERVING: 66.83% Total Features after vectorizing: 3526 Accuracy Score for FLAVOUR: 64.87% F1 Score for FLAVOUR: 64.87% Precision Score for FLAVOUR: 64.87%
for label in ['SPEED','SERVING','FLAVOUR']:
model = CountVectorizer(min_df=3)
not_null_df = comment_df[0:9600][pd.notnull(comment_df[label])]
features = model.fit_transform(not_null_df['COMMENT_TEXT'].values).todense()
labels = not_null_df[label].values
print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/BoW_CountVectorizer_LogisticRegression_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = LogisticRegression(penalty='l2', C=1.2, n_jobs=-1)
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Accuracy Score for {label}: {accuracy_score:0.2f}%".format(accuracy_score = accuracy_score(labels_test, label_prediction)*100, label=label))
print("F1 Score for {label}: {f1_score:0.2f}%".format(f1_score = f1_score(labels_test, label_prediction, average='micro')*100, label=label))
print("Precision Score for {label}: {precision_score:0.2f}%\n".format(precision_score = precision_score(labels_test, label_prediction, average='micro')*100, label=label))
Total Features after vectorizing: 3434 Accuracy Score for SPEED: 70.90% F1 Score for SPEED: 70.90% Precision Score for SPEED: 70.90% Total Features after vectorizing: 3434 Accuracy Score for SERVING: 67.33% F1 Score for SERVING: 67.33% Precision Score for SERVING: 67.33% Total Features after vectorizing: 3434 Accuracy Score for FLAVOUR: 65.73% F1 Score for FLAVOUR: 65.73% Precision Score for FLAVOUR: 65.73%
We are going to transform our reviews with TF-IDF Vectorizer, process is similar to one-hot encoding which calculates how frequently these words appear in a label.
for label in ['SPEED','SERVING','FLAVOUR']:
model = TfidfVectorizer(min_df=3)
not_null_df = comment_df[0:10000][pd.notnull(comment_df[label])]
features = model.fit_transform(not_null_df['COMMENT_TEXT'].values).todense()
labels = not_null_df[label].values
print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
#print(np.shape(labels))
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/BoW_TFIDFVectorizer_GaussianNB_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = GaussianNB()
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Accuracy Score for {label}: {accuracy_score:0.2f}%".format(accuracy_score = accuracy_score(labels_test, label_prediction)*100, label=label))
print("F1 Score for {label}: {f1_score:0.2f}%".format(f1_score = f1_score(labels_test, label_prediction, average='micro')*100, label=label))
print("Precision Score for {label}: {precision_score:0.2f}%\n".format(precision_score = precision_score(labels_test, label_prediction, average='micro')*100, label=label))
Total Features after vectorizing: 3526 Accuracy Score for SPEED: 34.10% F1 Score for SPEED: 34.10% Precision Score for SPEED: 34.10% Total Features after vectorizing: 3526 Accuracy Score for SERVING: 29.37% F1 Score for SERVING: 29.37% Precision Score for SERVING: 29.37% Total Features after vectorizing: 3526 Accuracy Score for FLAVOUR: 29.10% F1 Score for FLAVOUR: 29.10% Precision Score for FLAVOUR: 29.10%
for label in ['SPEED','SERVING','FLAVOUR']:
model = TfidfVectorizer(min_df=3)
not_null_df = comment_df[0:10000][pd.notnull(comment_df[label])]
features = model.fit_transform(not_null_df['COMMENT_TEXT'].values).todense()
labels = not_null_df[label].values
print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/BoW_TFIDFVectorizer_SVC_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
params = {
'C': [1.5],
#'C': [1.4,1.5,1.6],
'kernel': ['rbf']
#'kernel': ['linear','rbf','sigmoid']
}
gscv = GridSearchCV(SVC(), params, cv=5)
gscv.fit(features_train, labels_train)
print("Best parameters: {best_parameters}".format(best_parameters = gscv.best_params_))
model = gscv.best_estimator_
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Accuracy Score for {label}: {accuracy_score:0.2f}%".format(accuracy_score = accuracy_score(labels_test, label_prediction)*100, label=label))
print("F1 Score for {label}: {f1_score:0.2f}%".format(f1_score = f1_score(labels_test, label_prediction, average='micro')*100, label=label))
print("Precision Score for {label}: {precision_score:0.2f}%\n".format(precision_score = precision_score(labels_test, label_prediction, average='micro')*100, label=label))
Total Features after vectorizing: 3526 Accuracy Score for SPEED: 72.87% F1 Score for SPEED: 72.87% Precision Score for SPEED: 72.87% Total Features after vectorizing: 3526 Accuracy Score for SERVING: 68.50% F1 Score for SERVING: 68.50% Precision Score for SERVING: 68.50% Total Features after vectorizing: 3526 Accuracy Score for FLAVOUR: 66.83% F1 Score for FLAVOUR: 66.83% Precision Score for FLAVOUR: 66.83%
for label in ['SPEED','SERVING','FLAVOUR']:
model = TfidfVectorizer(min_df=3)
not_null_df = comment_df[0:10000][pd.notnull(comment_df[label])]
features = model.fit_transform(not_null_df['COMMENT_TEXT'].values).todense()
labels = not_null_df[label].values
print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/BoW_TFIDFVectorizer_LinearSVC_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = LinearSVC()
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Accuracy Score for {label}: {accuracy_score:0.2f}%".format(accuracy_score = accuracy_score(labels_test, label_prediction)*100, label=label))
print("F1 Score for {label}: {f1_score:0.2f}%".format(f1_score = f1_score(labels_test, label_prediction, average='micro')*100, label=label))
print("Precision Score for {label}: {precision_score:0.2f}%\n".format(precision_score = precision_score(labels_test, label_prediction, average='micro')*100, label=label))
Total Features after vectorizing: 3526 Accuracy Score for SPEED: 70.63% F1 Score for SPEED: 70.63% Precision Score for SPEED: 70.63% Total Features after vectorizing: 3526 Accuracy Score for SERVING: 66.77% F1 Score for SERVING: 66.77% Precision Score for SERVING: 66.77% Total Features after vectorizing: 3526 Accuracy Score for FLAVOUR: 66.30% F1 Score for FLAVOUR: 66.30% Precision Score for FLAVOUR: 66.30%
for label in ['SPEED','SERVING','FLAVOUR']:
model = TfidfVectorizer(min_df=3)
not_null_df = comment_df[0:9600][pd.notnull(comment_df[label])]
features = model.fit_transform(not_null_df['COMMENT_TEXT'].values).todense()
labels = not_null_df[label].values
print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/BoW_TFIDFVectorizer_DecisionTreeClassifier_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
params = {
'max_depth': [9,10,11],
'min_samples_split': [2,3]
}
gscv = GridSearchCV(DecisionTreeClassifier(), params, cv=5)
gscv.fit(features_train, labels_train)
print("Best parameters: {best_parameters}".format(best_parameters = gscv.best_params_))
model = gscv.best_estimator_
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Accuracy Score for {label}: {accuracy_score:0.2f}%".format(accuracy_score = accuracy_score(labels_test, label_prediction)*100, label=label))
print("F1 Score for {label}: {f1_score:0.2f}%".format(f1_score = f1_score(labels_test, label_prediction, average='micro')*100, label=label))
print("Precision Score for {label}: {precision_score:0.2f}%\n".format(precision_score = precision_score(labels_test, label_prediction, average='micro')*100, label=label))
Total Features after vectorizing: 3434 Accuracy Score for SPEED: 71.56% F1 Score for SPEED: 71.56% Precision Score for SPEED: 71.56% Total Features after vectorizing: 3434 Accuracy Score for SERVING: 66.70% F1 Score for SERVING: 66.70% Precision Score for SERVING: 66.70% Total Features after vectorizing: 3434 Accuracy Score for FLAVOUR: 64.44% F1 Score for FLAVOUR: 64.44% Precision Score for FLAVOUR: 64.44%
for label in ['SPEED','SERVING','FLAVOUR']:
model = TfidfVectorizer(min_df=3)
not_null_df = comment_df[0:9600][pd.notnull(comment_df[label])]
features = model.fit_transform(not_null_df['COMMENT_TEXT'].values).todense()
labels = not_null_df[label].values
print("Total Features after vectorizing: {total_features}".format(total_features = np.shape(features)[1]))
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/BoW_TFIDFVectorizer_LogisticRegression_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = LogisticRegression(penalty='l2', C=1.2, n_jobs=-1)
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Accuracy Score for {label}: {accuracy_score:0.2f}%".format(accuracy_score = accuracy_score(labels_test, label_prediction)*100, label=label))
print("F1 Score for {label}: {f1_score:0.2f}%".format(f1_score = f1_score(labels_test, label_prediction, average='micro')*100, label=label))
print("Precision Score for {label}: {precision_score:0.2f}%\n".format(precision_score = precision_score(labels_test, label_prediction, average='micro')*100, label=label))
Total Features after vectorizing: 3434 Accuracy Score for SPEED: 72.64% F1 Score for SPEED: 72.64% Precision Score for SPEED: 72.64% Total Features after vectorizing: 3434 Accuracy Score for SERVING: 68.61% F1 Score for SERVING: 68.61% Precision Score for SERVING: 68.61% Total Features after vectorizing: 3434 Accuracy Score for FLAVOUR: 66.63% F1 Score for FLAVOUR: 66.63% Precision Score for FLAVOUR: 66.63%
for label in ['SPEED','SERVING','FLAVOUR']:
not_null_df = service_df[pd.notnull(comment_df[label])]
features = not_null_df[['MINIMUM_AMOUNT','MINIMUM_SERVICE_TIME','MAXIMUM_SERVICE_TIME']].values
#features = not_null_df[['RESTAURANT_DISTRICT','RESTAURANT_BRAND','MINIMUM_AMOUNT','MINIMUM_SERVICE_TIME','MAXIMUM_SERVICE_TIME']].values
labels = not_null_df[label].values
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/Service_LinearRegression_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = LinearRegression(n_jobs=-1)
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Mean Squared Error for {label}: {mse:0.2f}".format(mse = mean_squared_error(labels_test, label_prediction), label=label))
print("R2 Score for {label}: {r2:0.2f}\n".format(r2 = r2_score(labels_test, label_prediction), label=label))
Mean Squared Error for SPEED: 0.64 R2 Score for SPEED: 0.00 Mean Squared Error for SERVING: 0.96 R2 Score for SERVING: -0.00 Mean Squared Error for FLAVOUR: 0.97 R2 Score for FLAVOUR: -0.00
for label in ['SPEED','SERVING','FLAVOUR']:
feature_columns = ['MINIMUM_AMOUNT','MINIMUM_SERVICE_TIME','MAXIMUM_SERVICE_TIME']
model_name = "LGBMRegressor"
not_null_df = service_df[pd.notnull(comment_df[label])]
features = not_null_df[feature_columns].values
labels = not_null_df[label].values
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/Service_{model_name}_{label}.mdl".format(label = label, model_name=model_name)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = LGBMRegressor(boosting_type='goss', n_jobs=-1, n_estimators=100, num_leaves=31, max_depth=-1, learning_rate=0.05, reg_alpha=0.0, reg_lambda=0.0)
model.fit(features_train, labels_train)
#joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Mean Squared Error for {label}: {mse:0.2f}".format(mse = mean_squared_error(labels_test, label_prediction), label=label))
print("R2 Score for {label}: {r2:0.4f}\n".format(r2 = r2_score(labels_test, label_prediction), label=label))
feature_importance_plot(label, model_name, feature_columns, model.feature_importances_)
merge_images(['images/SPEED.png', 'images/SERVING.png', 'images/FLAVOUR.png'])
Mean Squared Error for SPEED: 0.36 R2 Score for SPEED: 0.4287 Mean Squared Error for SERVING: 0.50 R2 Score for SERVING: 0.4717 Mean Squared Error for FLAVOUR: 0.51 R2 Score for FLAVOUR: 0.4703
for label in ['SPEED','SERVING','FLAVOUR']:
not_null_df = service_df[pd.notnull(comment_df[label])]
features = not_null_df[['MINIMUM_AMOUNT','MINIMUM_SERVICE_TIME','MAXIMUM_SERVICE_TIME']].values
labels = not_null_df[label].values
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/Service_XGBRegressor_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = XGBRegressor()
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Mean Squared Error for {label}: {mse:0.2f}".format(mse = mean_squared_error(labels_test, label_prediction), label=label))
print("R2 Score for {label}: {r2:0.4f}\n".format(r2 = r2_score(labels_test, label_prediction), label=label))
Mean Squared Error for SPEED: 0.40 R2 Score for SPEED: 0.3740 Mean Squared Error for SERVING: 0.54 R2 Score for SERVING: 0.4329 Mean Squared Error for FLAVOUR: 0.54 R2 Score for FLAVOUR: 0.4395
for label in ['SPEED','SERVING','FLAVOUR']:
not_null_df = service_df[pd.notnull(comment_df[label])]
features = not_null_df[['MINIMUM_AMOUNT','MINIMUM_SERVICE_TIME','MAXIMUM_SERVICE_TIME']].values
labels = not_null_df[label].values
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/Service_SGDRegressor_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = SGDRegressor(alpha=0.0001)
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Mean Squared Error for {label}: {mse:0.2f}".format(mse = mean_squared_error(labels_test, label_prediction), label=label))
print("R2 Score for {label}: {r2:0.4f}\n".format(r2 = r2_score(labels_test, label_prediction), label=label))
Mean Squared Error for SPEED: 873.17 R2 Score for SPEED: -1367.7777 Mean Squared Error for SERVING: 2616.85 R2 Score for SERVING: -2748.1079 Mean Squared Error for FLAVOUR: 857881.96 R2 Score for FLAVOUR: -888904.5494
for label in ['SPEED','SERVING','FLAVOUR']:
not_null_df = service_df[pd.notnull(comment_df[label])]
features = not_null_df[['MINIMUM_AMOUNT','MINIMUM_SERVICE_TIME','MAXIMUM_SERVICE_TIME']].values
labels = not_null_df[label].values
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/Service_KernelRidge_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = KernelRidge(kernel='polynomial', alpha=1, degree=5, coef0=1)
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Mean Squared Error for {label}: {mse:0.2f}".format(mse = mean_squared_error(labels_test, label_prediction), label=label))
print("R2 Score for {label}: {r2:0.4f}\n".format(r2 = r2_score(labels_test, label_prediction), label=label))
Mean Squared Error for SPEED: 0.55 R2 Score for SPEED: 0.1367 Mean Squared Error for SERVING: 0.83 R2 Score for SERVING: 0.1256 Mean Squared Error for FLAVOUR: 0.81 R2 Score for FLAVOUR: 0.1602
for label in ['SPEED','SERVING','FLAVOUR']:
not_null_df = service_df[pd.notnull(comment_df[label])]
features = not_null_df[['MINIMUM_AMOUNT','MINIMUM_SERVICE_TIME','MAXIMUM_SERVICE_TIME']].values
labels = not_null_df[label].values
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/Service_ElasticNet_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = ElasticNet(normalize=True)
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Mean Squared Error for {label}: {mse:0.2f}".format(mse = mean_squared_error(labels_test, label_prediction), label=label))
print("R2 Score for {label}: {r2:0.4f}\n".format(r2 = r2_score(labels_test, label_prediction), label=label))
Mean Squared Error for SPEED: 0.64 R2 Score for SPEED: -0.0015 Mean Squared Error for SERVING: 0.95 R2 Score for SERVING: -0.0002 Mean Squared Error for FLAVOUR: 0.97 R2 Score for FLAVOUR: -0.0004
for label in ['SPEED','SERVING','FLAVOUR']:
not_null_df = service_df[pd.notnull(comment_df[label])]
features = not_null_df[['MINIMUM_AMOUNT','MINIMUM_SERVICE_TIME','MAXIMUM_SERVICE_TIME']].values
labels = not_null_df[label].values
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/Service_BayesianRidge_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = BayesianRidge(normalize=True)
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Mean Squared Error for {label}: {mse:0.2f}".format(mse = mean_squared_error(labels_test, label_prediction), label=label))
print("R2 Score for {label}: {r2:0.4f}\n".format(r2 = r2_score(labels_test, label_prediction), label=label))
Mean Squared Error for SPEED: 0.63 R2 Score for SPEED: 0.0099 Mean Squared Error for SERVING: 0.95 R2 Score for SERVING: 0.0019 Mean Squared Error for FLAVOUR: 0.96 R2 Score for FLAVOUR: 0.0020
for label in ['SPEED','SERVING','FLAVOUR']:
not_null_df = service_df[pd.notnull(comment_df[label])]
features = not_null_df[['MINIMUM_AMOUNT','MINIMUM_SERVICE_TIME','MAXIMUM_SERVICE_TIME']].values
labels = not_null_df[label].values
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/Service_GradientBoostingRegressor_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = GradientBoostingRegressor(loss='ls', n_estimators=50, min_samples_split=20, min_samples_leaf=5, max_depth=3)
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Mean Squared Error for {label}: {mse:0.2f}".format(mse = mean_squared_error(labels_test, label_prediction), label=label))
print("R2 Score for {label}: {r2:0.4f}\n".format(r2 = r2_score(labels_test, label_prediction), label=label))
Mean Squared Error for SPEED: 0.38 R2 Score for SPEED: 0.4108 Mean Squared Error for SERVING: 0.51 R2 Score for SERVING: 0.4606 Mean Squared Error for FLAVOUR: 0.51 R2 Score for FLAVOUR: 0.4727
for label in ['SPEED','SERVING','FLAVOUR']:
not_null_df = service_df[pd.notnull(comment_df[label])]
features = not_null_df[['MINIMUM_AMOUNT','MINIMUM_SERVICE_TIME','MAXIMUM_SERVICE_TIME']].values
labels = not_null_df[label].values
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model_file_name = "models/Service_SVR_{label}.mdl".format(label = label)
if(check_if_file_exists(model_file_name)):
model = joblib.load(model_file_name)
else:
model = SVR(kernel='rbf')
model.fit(features_train, labels_train)
joblib.dump(model, model_file_name)
label_prediction = model.predict(features_test)
print("Mean Squared Error for {label}: {mse:0.2f}".format(mse = mean_squared_error(labels_test, label_prediction), label=label))
print("R2 Score for {label}: {r2:0.4f}\n".format(r2 = r2_score(labels_test, label_prediction), label=label))
Mean Squared Error for SPEED: 0.53 R2 Score for SPEED: 0.1693 Mean Squared Error for SERVING: 0.79 R2 Score for SERVING: 0.1743 Mean Squared Error for FLAVOUR: 0.80 R2 Score for FLAVOUR: 0.1722
sql_command = """
WITH EFM AS(
SELECT DISTINCT
EFM."CATEGORY_NAME",
EFM."PRODUCT_NAME",
EFM."PRODUCT_DESCRIPTION",
EFM."PRODUCT_LISTED_PRICE",
EFM."PRODUCT_PRICE",
EFM."DISCOUNT"
FROM "EDW"."DWH_FB_MENU" EFM
WHERE 1=1
AND CASE WHEN LOWER(EFM."CATEGORY_NAME") LIKE '%pizza%' OR LOWER(EFM."PRODUCT_NAME") LIKE '%pizza%' OR LOWER(EFM."PRODUCT_DESCRIPTION") LIKE '%pizza%' THEN 1 ELSE 0 END = 1
AND LOWER(EFM."CATEGORY_NAME") NOT LIKE '%soslar%' AND LOWER(EFM."CATEGORY_NAME") NOT LIKE '%yan %'
),
QUARTILES AS(
SELECT
PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY EFM."PRODUCT_LISTED_PRICE") AS Q1,
PERCENTILE_CONT(0.5) WITHIN GROUP(ORDER BY EFM."PRODUCT_LISTED_PRICE") AS Q2,
PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY EFM."PRODUCT_LISTED_PRICE") AS Q3
FROM EFM
),
OUTLIER_RANGES AS(
SELECT
QR.Q1,
QR.Q2,
QR.Q3,
QR.Q3-QR.Q1 AS IQR,
QR.Q1-(1.5*(QR.Q3-QR.Q1)) AS MINIMA,
QR.Q3+(1.5*(QR.Q3-QR.Q1)) AS MAXIMA
FROM QUARTILES QR
),
PIZZAS AS(
SELECT *
FROM EFM
INNER JOIN OUTLIER_RANGES ORA ON (1=1)
WHERE EFM."PRODUCT_LISTED_PRICE" BETWEEN ORA.MINIMA AND ORA.MAXIMA
ORDER BY EFM."PRODUCT_LISTED_PRICE"
)
SELECT
EFM.*,
PIZ.Q1,
PIZ.Q2,
PIZ.Q3,
PIZ.IQR,
PIZ.MINIMA,
PIZ.MAXIMA
FROM "EDW"."DWH_FB_MENU" EFM
INNER JOIN PIZZAS PIZ ON (EFM."CATEGORY_NAME" = PIZ."CATEGORY_NAME" AND EFM."PRODUCT_NAME" = PIZ."PRODUCT_NAME" AND EFM."PRODUCT_DESCRIPTION" = PIZ."PRODUCT_DESCRIPTION" AND EFM."PRODUCT_LISTED_PRICE" = PIZ."PRODUCT_LISTED_PRICE");
"""
menu_df = pd.read_sql(sql_command,conn)
menu_df.head()
| PRODUCT_ID | RESTAURANT_ID | CATEGORY_NAME | PRODUCT_NAME | PRODUCT_DESCRIPTION | PRODUCT_LISTED_PRICE | PRODUCT_PRICE | DISCOUNT | DESIGN_TYPE | DATE | q1 | q2 | q3 | iqr | minima | maxima | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4F9A1222-2174-4794-85BE-78B864137B58 | pizza-dante-sisli-ergenekon-mah-pangalti-istanbul | Vodafone Menüleri | Vodafone Menüsü (Pizza Refanzine (32 cm.)) | Pizza Refanzine (32 cm.) + Coca-Cola (33 cl.) ... | 52.0 | 52.0 | False | list | 2021-06-03 | 36.9 | 47.0 | 61.0 | 24.1 | 0.75 | 97.15 |
| 1 | 8b503b63-b162-4b43-a58c-301a41c6b98a | pizza-dante-sisli-ergenekon-mah-pangalti-istanbul | Klasik & Sebzeli Pizzalar (25 cm.) | Pizza Rucola (25 cm.) | Dante sos, mozzarella peyniri, roka, cherry do... | 66.0 | 33.0 | True | list | 2021-06-03 | 36.9 | 47.0 | 61.0 | 24.1 | 0.75 | 97.15 |
| 2 | 15cff837-c4f5-4ded-b09e-dd8c6fd5e591 | pizza-dante-sisli-ergenekon-mah-pangalti-istanbul | Klasik & Sebzeli Pizzalar (25 cm.) | Pizza Funghi (25 cm.) | Dante sos, mozzarella peyniri, kültür mantarı | 58.0 | 29.0 | True | list | 2021-06-03 | 36.9 | 47.0 | 61.0 | 24.1 | 0.75 | 97.15 |
| 3 | ef08f896-366b-40b0-9858-6cdc69401f41 | pizza-dante-sisli-ergenekon-mah-pangalti-istanbul | Klasik & Sebzeli Pizzalar (25 cm.) | Pizza Quattro Formaggi (25 cm.) | Dante sos, mozzarella peyniri, parmesan peynir... | 76.0 | 38.0 | True | list | 2021-06-03 | 36.9 | 47.0 | 61.0 | 24.1 | 0.75 | 97.15 |
| 4 | e7e2b046-6d7b-4636-bcd7-59be32110798 | pizza-dante-sisli-ergenekon-mah-pangalti-istanbul | Klasik & Sebzeli Pizzalar (25 cm.) | Pizza Mediterraneo (25 cm.) | Dante sos, mozzarella peyniri, beyaz peynir, y... | 66.0 | 33.0 | True | list | 2021-06-03 | 36.9 | 47.0 | 61.0 | 24.1 | 0.75 | 97.15 |
sql_command = """
WITH EFM AS(
SELECT DISTINCT
EFM."CATEGORY_NAME",
EFM."PRODUCT_NAME",
EFM."PRODUCT_DESCRIPTION",
EFM."PRODUCT_LISTED_PRICE",
EFM."PRODUCT_PRICE",
EFM."DISCOUNT"
FROM "EDW"."DWH_FB_MENU" EFM
WHERE 1=1
AND CASE WHEN LOWER(EFM."CATEGORY_NAME") LIKE '%pizza%' OR LOWER(EFM."PRODUCT_NAME") LIKE '%pizza%' OR LOWER(EFM."PRODUCT_DESCRIPTION") LIKE '%pizza%' THEN 1 ELSE 0 END = 1
AND LOWER(EFM."CATEGORY_NAME") NOT LIKE '%soslar%' AND LOWER(EFM."CATEGORY_NAME") NOT LIKE '%yan %'
),
QUARTILES AS(
SELECT
PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY EFM."PRODUCT_LISTED_PRICE") AS Q1,
PERCENTILE_CONT(0.5) WITHIN GROUP(ORDER BY EFM."PRODUCT_LISTED_PRICE") AS Q2,
PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY EFM."PRODUCT_LISTED_PRICE") AS Q3
FROM EFM
),
OUTLIER_RANGES AS(
SELECT
QR.Q1,
QR.Q2,
QR.Q3,
QR.Q3-QR.Q1 AS IQR,
QR.Q1-(1.5*(QR.Q3-QR.Q1)) AS MINIMA,
QR.Q3+(1.5*(QR.Q3-QR.Q1)) AS MAXIMA
FROM QUARTILES QR
),
PIZZAS AS(
SELECT *
FROM EFM
INNER JOIN OUTLIER_RANGES ORA ON (1=1)
WHERE EFM."PRODUCT_LISTED_PRICE" BETWEEN ORA.MINIMA AND ORA.MAXIMA
ORDER BY EFM."PRODUCT_LISTED_PRICE"
)
SELECT
EFM."RESTAURANT_ID",
AVG(EFM."PRODUCT_LISTED_PRICE") AS "AVERAGE_PRODUCT_LISTED_PRICE",
COUNT(*) AS "NUMBER_OF_PRODUCTS"
FROM "EDW"."DWH_FB_MENU" EFM
INNER JOIN PIZZAS PIZ ON (EFM."CATEGORY_NAME" = PIZ."CATEGORY_NAME" AND EFM."PRODUCT_NAME" = PIZ."PRODUCT_NAME" AND EFM."PRODUCT_DESCRIPTION" = PIZ."PRODUCT_DESCRIPTION" AND EFM."PRODUCT_LISTED_PRICE" = PIZ."PRODUCT_LISTED_PRICE")
GROUP BY EFM."RESTAURANT_ID";
"""
restaurant_df = pd.read_sql(sql_command,conn)
restaurant_df.head()
| RESTAURANT_ID | AVERAGE_PRODUCT_LISTED_PRICE | NUMBER_OF_PRODUCTS | |
|---|---|---|---|
| 0 | 1991-tarihi-ortakoy-kumpircisi-silivri-yeni-ma... | 41.600000 | 10 |
| 1 | 400-c-pizza-sisli-tesvikiye-mah-istanbul | 62.042553 | 47 |
| 2 | 400-c-pizza-umraniye-serifali-mah-istanbul | 63.476190 | 42 |
| 3 | 444-pizza-bagcilar-demirkapi-mah-istanbul | 43.163793 | 58 |
| 4 | 444-pizza-eyup-emniyettepe-mah-istanbul | 43.818462 | 65 |
if not(check_if_file_exists('images/kmeans_product_cluster.png')):
model = KMeans(n_clusters=5).fit(menu_df[["PRODUCT_LISTED_PRICE","PRODUCT_PRICE"]].values)
labels = model.predict(menu_df[["PRODUCT_LISTED_PRICE","PRODUCT_PRICE"]].values)
plt.scatter(menu_df["PRODUCT_LISTED_PRICE"].values,menu_df["PRODUCT_PRICE"].values,c=labels, cmap='viridis');
plt.xlabel("Listed Price")
plt.ylabel("Price with Discount")
plt.title('K-Means Clustering on Products')
plt.savefig('images/kmeans_product_cluster.png')
plt.close()
Image.open('images/kmeans_product_cluster.png')
if not(check_if_file_exists('images/kmeans_restaurant_cluster.png')):
model = KMeans(n_clusters=5).fit(restaurant_df[["AVERAGE_PRODUCT_LISTED_PRICE","NUMBER_OF_PRODUCTS"]].values)
labels = model.predict(restaurant_df[["AVERAGE_PRODUCT_LISTED_PRICE","NUMBER_OF_PRODUCTS"]].values)
plt.scatter(restaurant_df["AVERAGE_PRODUCT_LISTED_PRICE"].values,restaurant_df["NUMBER_OF_PRODUCTS"].values,c=labels, cmap='viridis');
plt.xlabel("Average Listed Price")
plt.ylabel("Number of Products")
plt.title('K-Means Clustering on Restaurants')
plt.savefig('images/kmeans_restaurant_cluster.png')
plt.close()
Image.open('images/kmeans_restaurant_cluster.png')
if not(check_if_file_exists('images/kmeans_log_restaurant_cluster.png')):
temp_df = np.log(restaurant_df[["AVERAGE_PRODUCT_LISTED_PRICE","NUMBER_OF_PRODUCTS"]]+1)
model = KMeans(n_clusters=5).fit(temp_df[["AVERAGE_PRODUCT_LISTED_PRICE","NUMBER_OF_PRODUCTS"]].values)
labels = model.predict(temp_df[["AVERAGE_PRODUCT_LISTED_PRICE","NUMBER_OF_PRODUCTS"]].values)
plt.scatter(temp_df["AVERAGE_PRODUCT_LISTED_PRICE"].values,temp_df["NUMBER_OF_PRODUCTS"].values,c=labels, cmap='viridis');
plt.xlabel("Average Listed Price")
plt.ylabel("Number of Products")
plt.title('K-Means Clustering on Restaurants with Log Transformation')
plt.savefig('images/kmeans_log_restaurant_cluster.png')
plt.close()
Image.open('images/kmeans_log_restaurant_cluster.png')
if not(check_if_file_exists('images/gmm_product_cluster.png')):
model = mixture.GaussianMixture(n_components=5).fit(menu_df[["PRODUCT_LISTED_PRICE","PRODUCT_PRICE"]].values)
labels = model.predict(menu_df[["PRODUCT_LISTED_PRICE","PRODUCT_PRICE"]].values)
plt.scatter(menu_df["PRODUCT_LISTED_PRICE"].values,menu_df["PRODUCT_PRICE"].values,c=labels, cmap='viridis');
plt.xlabel("Listed Price")
plt.ylabel("Price with Discount")
plt.title('GMM Clustering on Products')
plt.savefig('images/gmm_product_cluster.png')
plt.close()
Image.open('images/gmm_product_cluster.png')
if not(check_if_file_exists('images/gmm_restaurant_cluster.png')):
model = mixture.GaussianMixture(n_components=5).fit(restaurant_df[["AVERAGE_PRODUCT_LISTED_PRICE","NUMBER_OF_PRODUCTS"]].values)
labels = model.predict(restaurant_df[["AVERAGE_PRODUCT_LISTED_PRICE","NUMBER_OF_PRODUCTS"]].values)
plt.scatter(restaurant_df["AVERAGE_PRODUCT_LISTED_PRICE"].values,restaurant_df["NUMBER_OF_PRODUCTS"].values,c=labels, cmap='viridis');
plt.xlabel("Average Listed Price")
plt.ylabel("Number of Products")
plt.title('GMM Clustering on Restaurants')
plt.savefig('images/gmm_restaurant_cluster.png')
plt.close()
Image.open('images/gmm_restaurant_cluster.png')
if not(check_if_file_exists('images/gmm_log_restaurant_cluster.png')):
temp_df = np.log(restaurant_df[["AVERAGE_PRODUCT_LISTED_PRICE","NUMBER_OF_PRODUCTS"]]+1)
model = mixture.GaussianMixture(n_components=5).fit(temp_df[["AVERAGE_PRODUCT_LISTED_PRICE","NUMBER_OF_PRODUCTS"]].values)
labels = model.predict(temp_df[["AVERAGE_PRODUCT_LISTED_PRICE","NUMBER_OF_PRODUCTS"]].values)
plt.scatter(temp_df["AVERAGE_PRODUCT_LISTED_PRICE"].values,temp_df["NUMBER_OF_PRODUCTS"].values,c=labels, cmap='viridis');
plt.xlabel("Average Listed Price")
plt.ylabel("Number of Products")
plt.title('GMM Clustering on Restaurants with Log Transformation')
plt.savefig('images/gmm_log_restaurant_cluster.png')
plt.close()
Image.open('images/gmm_log_restaurant_cluster.png')
Markov chains calculate how frequently words follow each other and generate new text depending on the first words we are given. These models can be used to create fake reviews.
sql_command = """
WITH COMMENTS AS(
SELECT "RESTAURANT_ID",
STRING_AGG(LOWER("COMMENT_TEXT"), ' ') AS ALL_COMMENT,
COUNT("COMMENT_TEXT") AS COMMENT_COUNT
FROM "ODS"."EXT_FB_COMMENT" EFC
GROUP BY "RESTAURANT_ID"
)
SELECT *
FROM COMMENTS
ORDER BY COMMENT_COUNT DESC;
"""
restaurant_df = pd.read_sql(sql_command,conn)
restaurant_df.head()
| RESTAURANT_ID | all_comment | comment_count | |
|---|---|---|---|
| 0 | pizza-bulls-uskudar-altunizade-mah-istanbul | lezzeti eskisi gibi gelmedi bize. fark ödeyip ... | 2293 |
| 1 | pizza-bulls-uskudar-ferah-mah-istanbul | her şey çok güzeldi. elinize sağlık. hizli ve ... | 1764 |
| 2 | pizza-bulls-kartal-soganlik-yeni-mah-istanbul | cok pahali olmasi disinda bir problem yok gibi... | 1743 |
| 3 | pizza-bulls-umraniye-cakmak-mah-istanbul | çok lezzetli güvenle çoçuguma yedirebiliyorum ... | 1582 |
| 4 | pizza-bulls-atasehir-fetih-mah-istanbul | servis ve hız 10 üzerinden 20.. bu siparişi is... | 1277 |
import random
def make_markov_model(cleaned_stories, n_gram=2):
markov_model = {}
for i in range(len(cleaned_stories)-n_gram-1):
curr_state, next_state = "", ""
for j in range(n_gram):
curr_state += cleaned_stories[i+j] + " "
next_state += cleaned_stories[i+j+n_gram] + " "
curr_state = curr_state[:-1]
next_state = next_state[:-1]
if curr_state not in markov_model:
markov_model[curr_state] = {}
markov_model[curr_state][next_state] = 1
else:
if next_state in markov_model[curr_state]:
markov_model[curr_state][next_state] += 1
else:
markov_model[curr_state][next_state] = 1
# calculating transition probabilities
for curr_state, transition in markov_model.items():
total = sum(transition.values())
for state, count in transition.items():
markov_model[curr_state][state] = count/total
return markov_model
def generate_story(markov_model, limit=100, start='my god'):
n = 0
curr_state = start
next_state = None
story = ""
story+=curr_state+" "
while n<limit:
next_state = random.choices(list(markov_model[curr_state].keys()),
list(markov_model[curr_state].values()))
curr_state = next_state[0]
story+=curr_state+" "
n+=1
return story
restaurant_df['clean_comment'] = restaurant_df['all_comment'].apply(clean_text)
text = restaurant_df.loc[0]['clean_comment'].split()
markov_model = make_markov_model(text)
for i in range(20):
print(str(i)+". ", generate_story(markov_model, start="yemek çok", limit=20))
0. yemek çok lezzetliydi ve vaktinde geldi sıcaktı güler yüzlü teslimat için cumali beye teşekkür ederiz semih bey ve atakan beye teşekkürler kurye maskesizdi bu nedele 1 puan verdim lezzet ve hizmet güzeldi güler yüzlü bir şekilde teslim edilmesini sağlayan servis çalışanlarına ayrıca 1. yemek çok lezzetliydi ve her şey çok güzeldi tatlılarda ayrı bi güzeldi elllerinize sağlık semih beye teşekkürler suflemi getirmediniz bi daha asla sipariş vermiyicem kuryeniz mustafa beye nezaketi ve hizmet kalitesinden teşekkür ederiyorum bilal beye teşekkür ederiz pizza çok lezzetliydi ilginiz ve 2. yemek çok güzeldi semih beye teşekkürler her zamanki gibi çok güzeldi soğan halkası ikramı için ve ilgisi icin telefonda tesekkurler motorcu arkadaş çok efendiydi lezzet iyiydi her zaman lezzetli ayrıca cumali beye de ilgisi için çok teşekkürler muazzam bir lezzet muhteşem pizzalar 3. yemek çok lezzetliydi ve restoran bana çok yakın olmamasına rağmen sıcaktı da soğan halkaları da tam tersi az pişmişti ama pizzanın lezzeti malzeme kalitesi iyiydi sufle ikramı için bu ekibe teşekkür ederim binaya giriş için komşumun ziline basılmış ve geç saat olduğu 4. yemek çok lezzetliydi ve hızlıydı tesekkurler çok hızlı getirdi tüm pizza bulls şubeleri arasında türkiyede tek bence herkese tavsiye ederim efsane lezzetin sıcacık ve özenli servis için teşekkürler senelerdir burdan söylüyorum şu ana kadar kötü pizza ve en berbat yer pizza bulls 5. yemek çok lezzetliydi ve vaktinde geldi sıcaktı güler yüzlü teslimat için cumali beye teşekkür ederim bilal beye nezaketi ve alakası için teşekkür ederim atakan beye selam çok teşşrkur ederim atakan beye özellikle teşekkür ederiz hem zamanında hemde sıcacık geliyor yanındaki ikramınız içinde 6. yemek çok güzeldi semih bey harikasınız maşallah size bize her zaman siz getirin ve sufle getirin mükemmel hizmet gayet hızlı geldi hediye sufle için ayrıca teşekkür ederim bilal bey çok nezaketliydi i̇kramımız için teşekkür ederiz 2 sufle borcunuz var 2008-2009 da yediğimiz 7. yemek çok güzeldi semih beyin özel ilgisi için teşekkür ederim ayriyeten de atacan beye de teşekkürler gecenin bir vaktinde açı olması rahat rahat sahur yapiyoruz sayelerinde tesekkurler gerçekten hem sıcak hem bol malzemos pizzalarımız her zamanki gibi sıcak ve hızlı atakan beye 8. yemek çok güzeldi semih beyin özel ilgisi için bilal beye çok teşekkür ediyorum kendisine ve çalışma arkadaşlarına başarılar dilerim bu zamana kadar kötü yemedim ilk kez soğuk tatsız geldi cumali beye özellikle çok teşekkür ederim pizza mükemmeldi mükemmeldi pizza bulls ekibi gerçekten 9. yemek çok güzeldi semih bey harikasınız maşallah size bize her zaman siz getirin ve sufle getirin mükemmel hizmet güleryüzlü çalışanlar pizallar gayet lezzetli servis görevlisi cumali bey e ilgisi için teşekkür ederim her zamanki gibi hızlı ve sıcak geliyor süper hizmet için 10. yemek çok güzeldi semih beye teşekkürler çok sıcak geldi son derece lezzetliydi ayrıca servis de çok güzeldi pizza gerçekten çok ilgili bir işletme kesinlikle öneririm 2 kat cheddarlı yiyin kurye halil kardeşime altunizade şube çalışanlarına teşekkürler kurye maskesizdi bu nedele 1 puan 11. yemek çok güzeldi semih bey harikasınız maşallah size bize her zaman siz getirin ve sufle getirin mükemmel hizmet güleryüzlü çalışanlar pizallar gayet lezzetli servis görevlisi cumali bey e teşekkürler atakan beye çok teşekkürler her zamanki gibi bol malzemeli geldi i̇dare eder i̇kram 12. yemek çok güzeldi semih beye teşekkürler siparişin yanında hediye tatlı içinde çok teşekkür ederim bilal beye teşekkür ederiz her zaman olduğu gibi güzeldi hiz lezzet super ozellikle pizzanin hamuru ve kullandiklari peynir cok kaliteli ve lezzetli oldugunu dusunuyorum ve servis eksiksizdi tek 13. yemek çok güzeldi semih beyin özel ilgisi bu müthiş tadı ve hizmeti daha da sipariş vermem bullstan çok teşekkür ederiz umarım hep böyle devam allah zeval vermesin semih beye teşekkürler her şey yine çok güzeldi ayrıca hediye sufle icin tesekkurler i̇lginize teşekkürler 14. yemek çok lezzetliydi ve her şey harikaydı notlar okunmuştu ayrıca ikram için teşekkürler herşey çok iyiydi pizzanın atakan beye teşekkürler kurye kibardı tesekkurler pizzalarınız muazzam çalışanlarınız çok güler yüzlü bir beyfendiydi lezzet de dahil her şeyiniz güzeldi ellerinize sağlık çok teşekkür ederiz 15. yemek çok lezzetliydi ve getiren arkadasta cok hizli getirdi ve cok nazikti semih beye teşekkürler çok lezzetli hızlı ve sıcak geldi atakan bey e hediyesi için teşekkürler atakan bey çok nazikti her şey çok güzeldi ikram olarak yolladığınız sufle içinde çok teşekkür 16. yemek çok güzeldi semih beye ilgisinden dolay çok teşekkürler atakan bey i̇lgi ve ikramlar için teşekkürler her zamanki gibi sıcak geldi özellikle atakan beyin ilgisine teşekkürler çok nazikdi pizza ve kola artı patates söylemiştim üzüldüm sizi ve yaptığınız yemekleri arkadaşlarımdan daha çok 17. yemek çok güzeldi semih beyin özel ilgisi için ayrıca teşekkürler herşey çok güzeldi herşey lezzetli ve sıcacık geliyor getiren çalışanlar her zaman hızlı hizmet ve hiz superdi tesekkurler ibrahim bey i̇kram için ayrıca teşekkür ederim bu civarın en iyi işletmesi diyebilirim bir 18. yemek çok güzeldi semih beye ilgisinden dolayi tesekkurler lezzetliydi ve her şey çok güzeldi her sey için tesekkurler tarık bey i̇kram sufle için teşekkür ederim i̇lgi ve alakalarından dolayı teşekkürler bilal beye çok teşekkür ederim her zaman hızlı nazik ve güler yüzlüydü 19. yemek çok lezzetliydi ve yanında 2 adet ikram sufle geldi i̇ftar vakti olduğu için lezzetide iyi gelmedi küçücük bişeye 45 tl para verdim yazık oldu pizza çok iyiydi fırat beye teşekkürler i̇lginiz ve alakanız için tşekkür ederim i̇lginize teşekkürler blal bey i̇lk
Convolutional neural networks are used for image classification purposes. For our labels, we are considering if a product name, description, or product category contains the word pizza in it or not.
sql_command = """
SELECT
DISTINCT
FPI."IMAGE_LINK",
FPIS."SOURCE",
CASE WHEN LOWER(EFM."CATEGORY_NAME") LIKE '%pizza%' OR LOWER(EFM."PRODUCT_NAME") LIKE '%pizza%' OR LOWER(EFM."PRODUCT_DESCRIPTION") LIKE '%pizza%' THEN 1 ELSE 0 END AS "IS_PIZZA"
FROM "ODS"."EXT_FB_MENU" EFM
INNER JOIN "ODS"."EXT_FB_PRODUCT_IMAGE" FPI ON (EFM."PRODUCT_ID" = FPI."PRODUCT_ID")
INNER JOIN "ODS"."EXT_FB_PRODUCT_IMAGE_SOURCE" FPIS ON (FPI."IMAGE_LINK" = FPIS."IMAGE_LINK");
"""
image_source_df = pd.read_sql(sql_command,conn)
image_source_df
| IMAGE_LINK | SOURCE | IS_PIZZA | |
|---|---|---|---|
| 0 | https://cdn.yemeksepeti.com/ProductImages/corb... | [b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ... | 0 |
| 1 | https://cdn.yemeksepeti.com/ProductImages/corb... | [b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ... | 0 |
| 2 | https://cdn.yemeksepeti.com/ProductImages/corb... | [b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ... | 0 |
| 3 | https://cdn.yemeksepeti.com/ProductImages/corb... | [b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ... | 0 |
| 4 | https://cdn.yemeksepeti.com/ProductImages/corb... | [b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ... | 0 |
| ... | ... | ... | ... |
| 4961 | https://cdn.yemeksepeti.com/restaurant/TR_ISTA... | [b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ... | 1 |
| 4962 | https://cdn.yemeksepeti.com/restaurant/TR_ISTA... | [b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ... | 0 |
| 4963 | https://cdn.yemeksepeti.com/restaurant/TR_ISTA... | [b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ... | 1 |
| 4964 | https://cdn.yemeksepeti.com/restaurant/TR_ISTA... | [b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ... | 1 |
| 4965 | https://cdn.yemeksepeti.com/restaurant/TR_ISTA... | [b'\xff', b'\xd8', b'\xff', b'\xe0', b'\x00', ... | 0 |
4966 rows × 3 columns
features = []
labels = []
#image_id = []
for i in range(len(image_source_df)):
image_source = byte_image_to_numpy_array(image_source_df.iloc[i]["SOURCE"], 200)
is_pizza = image_source_df.iloc[i]["IS_PIZZA"]
#image_id.append(image_source_df.iloc[i]["IMAGE_LINK"])
features.append(image_source)
labels.append(is_pizza)
#Image.fromarray(features[2381])
features = np.array(features).reshape(-1, 200, 200, 3)
features = features.astype('float32')/255
labels = np_utils.to_categorical(labels, 2)
batch_size = 16
epochs = 3
print("Feature Shape: {f_shape} Label Shape: {l_shape}".format(f_shape=features.shape, l_shape=labels.shape))
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 9)
model = tf.keras.Sequential([
tf.keras.layers.Conv2D(32, (3,3), padding='same', activation=tf.nn.relu,
input_shape=(200, 200, 3)),
tf.keras.layers.MaxPooling2D((2, 2), strides=2),
tf.keras.layers.Conv2D(32, (3,3), padding='same', activation=tf.nn.relu),
tf.keras.layers.MaxPooling2D((2, 2), strides=2),
tf.keras.layers.Dropout(0.5),
tf.keras.layers.Flatten(),
tf.keras.layers.Dense(128, activation=tf.nn.relu),
tf.keras.layers.Dense(2, activation=tf.nn.softmax)
])
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
model.fit(features_train, labels_train, batch_size = batch_size, epochs = epochs, verbose = 1, validation_data = (features_test, labels_test))
label_prediction = model.predict(features_test)
score = model.evaluate(features_test, labels_test, verbose = 0)
print("""Loss: {loss_score:0.4f} Accuracy: {accuracy:0.2f}%""".format(loss_score=score[0], accuracy=score[1]*100))
Feature Shape: (4966, 200, 200, 3) Label Shape: (4966, 2) Epoch 1/3 218/218 [==============================] - 98s 449ms/step - loss: 0.5891 - accuracy: 0.7463 - val_loss: 0.4245 - val_accuracy: 0.7852 Epoch 2/3 218/218 [==============================] - 94s 431ms/step - loss: 0.3319 - accuracy: 0.8550 - val_loss: 0.2872 - val_accuracy: 0.8812 Epoch 3/3 218/218 [==============================] - 94s 432ms/step - loss: 0.2245 - accuracy: 0.9100 - val_loss: 0.2555 - val_accuracy: 0.9074 Loss: 0.2555 Accuracy: 90.74%
if(not(check_if_file_exists('images/cnn_tp.png') and check_if_file_exists('images/cnn_tn.png') and check_if_file_exists('images/cnn_fp.png') and check_if_file_exists('images/cnn_fn.png'))):
tp, tn, fp, fn = False, False, False, False
#test_row = 456
#test_row = 456
for test_row in range(len(labels_test)):
true_label = labels_test[test_row][1].astype(np.uint8)
prediction = round(label_prediction[test_row][1])
if(true_label == 1 and prediction == 1 and tp == False):
tp = test_row
plt.title('True Positive',fontweight='bold',fontsize=15)
plt.imshow((features_test[tp]*255).astype(np.uint8))
plt.axis('off')
plt.savefig('images/cnn_tp.png')
plt.close()
continue
if(true_label == 0 and prediction == 0 and tn == False):
tn = test_row
plt.title('True Negative',fontweight='bold',fontsize=15)
plt.imshow((features_test[tn]*255).astype(np.uint8))
plt.axis('off')
plt.savefig('images/cnn_tn.png')
plt.close()
continue
if(true_label == 0 and prediction == 1 and fp == False):
fp = test_row
plt.title('False Positive',fontweight='bold',fontsize=15)
plt.imshow((features_test[fp]*255).astype(np.uint8))
plt.axis('off')
plt.savefig('images/cnn_fp.png')
plt.close()
continue
if(true_label == 1 and prediction == 0 and fn == False):
fn = test_row
plt.title('False Negative',fontweight='bold',fontsize=15)
plt.imshow((features_test[fn]*255).astype(np.uint8))
plt.axis('off')
plt.savefig('images/cnn_fn.png')
plt.close()
continue
images = [Image.open(x) for x in ['images/cnn_tp.png', 'images/cnn_tn.png', 'images/cnn_fp.png', 'images/cnn_fn.png']]
widths, heights = zip(*(i.size for i in images))
total_width = sum(widths)
max_height = max(heights)
merged_im = Image.new('RGB', (total_width, max_height))
x_offset = 0
for im in images:
merged_im.paste(im, (x_offset,0))
x_offset += im.size[0]
merged_im